check C++ code with -Wmissing-declarations (#3184)

This commit is contained in:
Cebtenzzre 2023-09-15 15:38:27 -04:00 committed by GitHub
parent 69eb67e282
commit 3aefaab9e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 247 additions and 243 deletions

View File

@@ -427,6 +427,7 @@ if (LLAMA_ALL_WARNINGS)
-Wextra -Wextra
-Wpedantic -Wpedantic
-Wcast-qual -Wcast-qual
-Wmissing-declarations
-Wno-unused-function -Wno-unused-function
-Wno-multichar -Wno-multichar
) )

View File

@@ -172,9 +172,16 @@ endif # LLAMA_DISABLE_LOGS
# warnings # warnings
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang,$(shell $(CXX) --version))' # TODO(cebtenzzre): remove this once PR #2632 gets merged
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
# clang++ only
MK_CXXFLAGS += -Wmissing-prototypes
TTFS_CXXFLAGS += -Wno-missing-prototypes
else
# g++ only # g++ only
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
endif endif
@@ -524,7 +531,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View File

@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
} }
void process_escapes(std::string& input) { static void process_escapes(std::string& input) {
std::size_t input_len = input.length(); std::size_t input_len = input.length();
std::size_t output_idx = 0; std::size_t output_idx = 0;

View File

@@ -158,7 +158,7 @@ namespace console {
} }
} }
char32_t getchar32() { static char32_t getchar32() {
#if defined(_WIN32) #if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0; wchar_t high_surrogate = 0;
@@ -212,7 +212,7 @@ namespace console {
#endif #endif
} }
void pop_cursor() { static void pop_cursor() {
#if defined(_WIN32) #if defined(_WIN32)
if (hConsole != NULL) { if (hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo; CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
@@ -233,7 +233,7 @@ namespace console {
putc('\b', out); putc('\b', out);
} }
int estimateWidth(char32_t codepoint) { static int estimateWidth(char32_t codepoint) {
#if defined(_WIN32) #if defined(_WIN32)
(void)codepoint; (void)codepoint;
return 1; return 1;
@@ -242,7 +242,7 @@ namespace console {
#endif #endif
} }
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32) #if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo; CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
@@ -303,7 +303,7 @@ namespace console {
#endif #endif
} }
void replace_last(char ch) { static void replace_last(char ch) {
#if defined(_WIN32) #if defined(_WIN32)
pop_cursor(); pop_cursor();
put_codepoint(&ch, 1, 1); put_codepoint(&ch, 1, 1);
@@ -312,7 +312,7 @@ namespace console {
#endif #endif
} }
void append_utf8(char32_t ch, std::string & out) { static void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) { if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch)); out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) { } else if (ch <= 0x7FF) {
@@ -333,7 +333,7 @@ namespace console {
} }
// Helper function to remove the last UTF-8 character from a string // Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) { static void pop_back_utf8_char(std::string & line) {
if (line.empty()) { if (line.empty()) {
return; return;
} }
@@ -349,7 +349,7 @@ namespace console {
line.erase(pos); line.erase(pos);
} }
bool readline_advanced(std::string & line, bool multiline_input) { static bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) { if (out != stdout) {
fflush(stdout); fflush(stdout);
} }
@@ -452,7 +452,7 @@ namespace console {
return has_more; return has_more;
} }
bool readline_simple(std::string & line, bool multiline_input) { static bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32) #if defined(_WIN32)
std::wstring wline; std::wstring wline;
if (!std::getline(std::wcin, wline)) { if (!std::getline(std::wcin, wline)) {

View File

@@ -9,7 +9,7 @@
namespace grammar_parser { namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun) // NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp // copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) { static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src); uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4; uint8_t highbits = first_byte >> 4;
@@ -24,19 +24,19 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second; return result.first->second;
} }
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id; return next_id;
} }
void add_rule( static void add_rule(
parse_state & state, parse_state & state,
uint32_t rule_id, uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) { const std::vector<llama_grammar_element> & rule) {
@@ -46,11 +46,11 @@ namespace grammar_parser {
state.rules[rule_id] = rule; state.rules[rule_id] = rule;
} }
bool is_word_char(char c) { static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
} }
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) { static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src; const char * pos = src;
const char * end = src + size; const char * end = src + size;
uint32_t value = 0; uint32_t value = 0;
@@ -73,7 +73,7 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
const char * parse_space(const char * src, bool newline_ok) { static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src; const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' || while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) { (newline_ok && (*pos == '\r' || *pos == '\n'))) {
@@ -88,7 +88,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_name(const char * src) { static const char * parse_name(const char * src) {
const char * pos = src; const char * pos = src;
while (is_word_char(*pos)) { while (is_word_char(*pos)) {
pos++; pos++;
@@ -99,7 +99,7 @@ namespace grammar_parser {
return pos; return pos;
} }
std::pair<uint32_t, const char *> parse_char(const char * src) { static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') { if (*src == '\\') {
switch (src[1]) { switch (src[1]) {
case 'x': return parse_hex(src + 2, 2); case 'x': return parse_hex(src + 2, 2);
@@ -129,7 +129,7 @@ namespace grammar_parser {
uint32_t rule_id, uint32_t rule_id,
bool is_nested); bool is_nested);
const char * parse_sequence( static const char * parse_sequence(
parse_state & state, parse_state & state,
const char * src, const char * src,
const std::string & rule_name, const std::string & rule_name,
@@ -247,7 +247,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_rule(parse_state & state, const char * src) { static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src); const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false); const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src; size_t name_len = name_end - src;
@@ -285,7 +285,7 @@ namespace grammar_parser {
} }
} }
void print_grammar_char(FILE * file, uint32_t c) { static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) { if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c)); fprintf(file, "%c", static_cast<char>(c));
} else { } else {
@@ -294,7 +294,7 @@ namespace grammar_parser {
} }
} }
bool is_char_element(llama_grammar_element elem) { static bool is_char_element(llama_grammar_element elem) {
switch (elem.type) { switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true; case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true; case LLAMA_GRETYPE_CHAR_NOT: return true;
@@ -304,7 +304,7 @@ namespace grammar_parser {
} }
} }
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) { static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) { for (auto elem : rule) {
switch (elem.type) { switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break; case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
@@ -334,7 +334,7 @@ namespace grammar_parser {
fprintf(file, "\n"); fprintf(file, "\n");
} }
void print_rule( static void print_rule(
FILE * file, FILE * file,
uint32_t rule_id, uint32_t rule_id,
const std::vector<llama_grammar_element> & rule, const std::vector<llama_grammar_element> & rule,

View File

@@ -9,12 +9,12 @@
#endif #endif
#ifdef LLAMA_DEFAULT_RMS_EPS #ifdef LLAMA_DEFAULT_RMS_EPS
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else #else
static const float rms_norm_eps = 5e-6f; constexpr float rms_norm_eps = 5e-6f;
#endif #endif
float frand() { static float frand() {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
@@ -25,19 +25,21 @@ struct random_normal_distribution {
float max; float max;
}; };
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { static void init_random_normal_distribution(
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
) {
rnd->gen = std::mt19937(seed); rnd->gen = std::mt19937(seed);
rnd->nd = std::normal_distribution<float>{mean, std}; rnd->nd = std::normal_distribution<float>{mean, std};
rnd->min = min; rnd->min = min;
rnd->max = max; rnd->max = max;
} }
float frand_normal(struct random_normal_distribution * rnd) { static float frand_normal(struct random_normal_distribution * rnd) {
const float r = rnd->nd(rnd->gen); const float r = rnd->nd(rnd->gen);
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
} }
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) { if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan); ggml_graph_compute(graph, &plan);
} }
struct ggml_tensor * randomize_tensor( static struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor, struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
int ndims, ) {
const int64_t ne[],
float fmin,
float fmax) {
switch (ndims) { switch (ndims) {
case 1: case 1:
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
return tensor; return tensor;
} }
struct ggml_tensor * randomize_tensor_normal( static struct ggml_tensor * randomize_tensor_normal(
struct ggml_tensor * tensor, struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
int ndims, ) {
const int64_t ne[],
struct random_normal_distribution * rnd) {
float scale = 1.0; // xavier float scale = 1.0; // xavier
switch (ndims) { switch (ndims) {
case 1: case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
} }
}; };
uint32_t get_n_ff(const struct llama_hparams* hparams) { static uint32_t get_n_ff(const struct llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff; return n_ff;
} }
@@ -260,7 +256,7 @@ struct llama_model_lora {
std::vector<llama_layer_lora> layers; std::vector<llama_layer_lora> layers;
}; };
void init_model(struct llama_model * model) { static void init_model(struct llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
} }
void init_model_lora(struct llama_model_lora * model) { static void init_model_lora(struct llama_model_lora * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
} }
} }
void set_param_model(struct llama_model * model) { static void set_param_model(struct llama_model * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
} }
} }
void set_param_model_lora(struct llama_model_lora * model) { static void set_param_model_lora(struct llama_model_lora * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
} }
} }
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
} }
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) { static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
} }
} }
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
return true; return true;
} }
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
return true; return true;
} }
struct ggml_tensor * forward( static struct ggml_tensor * forward(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past) { const int n_past
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
return inpL; return inpL;
} }
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
} }
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
} }
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
} }
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
GGML_ASSERT(tensor->ne[3] == ne3); GGML_ASSERT(tensor->ne[3] == ne3);
} }
struct ggml_tensor * forward_batch( static struct ggml_tensor * forward_batch(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past, const int n_past,
const int n_batch) { const int n_batch
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
return inpL; return inpL;
} }
static struct ggml_tensor * forward_lora(
struct ggml_tensor * forward_lora( struct llama_model_lora * model,
struct llama_model_lora * model, struct llama_kv_cache * cache,
struct llama_kv_cache * cache, struct ggml_context * ctx0,
struct ggml_context * ctx0, struct ggml_cgraph * gf,
struct ggml_cgraph * gf, struct ggml_tensor * tokens_input,
struct ggml_tensor * tokens_input, const int n_tokens,
const int n_tokens, const int n_past
const int n_past) { ) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
return inpL; return inpL;
} }
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2); assert(logits->n_dims == 2);
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
assert(best_samples->n_dims == 1); assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
} }
} }
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples
) {
GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(best_samples->n_dims == 2);
GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(logits->n_dims == 3);
GGML_ASSERT(probs->n_dims == 3); GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
} }
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
printf(" %.2f", p); printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
} }
} }
void print_token(int token, int n_vocab) { static void print_token(int token, int n_vocab) {
for (int k = 0; k < token; ++k) { for (int k = 0; k < token; ++k) {
printf(" "); printf(" ");
} }
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
printf("\n"); printf("\n");
} }
void print_tokens(struct ggml_tensor * tokens, int n_vocab) { static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
for (int i=0; i<tokens->ne[0]; ++i) { for (int i=0; i<tokens->ne[0]; ++i) {
int token = ggml_get_i32_1d(tokens, i); int token = ggml_get_i32_1d(tokens, i);
print_token(token, n_vocab); print_token(token, n_vocab);
} }
} }
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
float randomness = 0.0f; float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
} }
} }
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT( targets->n_dims == 3); GGML_ASSERT( targets->n_dims == 3);
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
} }
} }
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
for (int i=0; i<n_tokens-n_shift; ++i) { for (int i=0; i<n_tokens-n_shift; ++i) {
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
} }
} }
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * square_error_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
// todo: instead of a-b: a[1:]-b[:-1] // todo: instead of a-b: a[1:]-b[:-1]
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b))); return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
} }
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * cross_entropy_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
const float eps = 1e-3f; const float eps = 1e-3f;
return return
ggml_sum(ctx, ggml_sum(ctx,

View File

@@ -30,7 +30,8 @@ struct ostream_beam_view {
llama_context * ctx; llama_context * ctx;
llama_beam_view beam_view; llama_beam_view beam_view;
}; };
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@@ -46,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc. // For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
} }
@@ -56,7 +57,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met. // This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr); auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed. // Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {

View File

@@ -115,7 +115,7 @@ struct TransformerWeights {
} }
}; };
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim](); w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
} }
} }
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0; return 0;
} }
void print_sample_weights(TransformerWeights *w){ static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n"); printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]); printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]); printf("%f\n", w->rms_att_weight[0]);
@ -324,7 +324,7 @@ struct train_params {
int mem_compute1_gb; int mem_compute1_gb;
}; };
void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %d\n", __func__, params->n_embd);
@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %d\n", __func__, params->n_rot);
} }
void init_model(struct my_llama_model * model) { static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
} }
} }
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i); float p = get_f32_2d(probs, k, i);
printf(" %f", p); printf(" %f", p);
@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@ -531,7 +531,7 @@ struct llama_file {
} }
}; };
bool is_ggml_file(const char *filename) { static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
return magic == GGUF_MAGIC; return magic == GGUF_MAGIC;
} }
static std::string llama_escape_whitespaces(const std::string& text) { static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out; std::ostringstream out;
for (char c : text) { for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81"; if (c == ' ') out << "\xe2\x96\x81";
@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
return out.str(); return out.str();
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
} }
} }
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (gg_weights->n_dims){
case 1: case 1:
@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
} }
} }
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one. // convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings // w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor // float* -> struct ggml_tensor
@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
gguf_free(ctx); gguf_free(ctx);
} }
struct train_params get_default_train_params() { static struct train_params get_default_train_params() {
struct train_params params; struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin";
@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
return params; return params;
} }
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
bool params_parse(int argc, char ** argv, struct train_params * params) { static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false; bool invalid_param = false;
bool reqd_param_found = false; bool reqd_param_found = false;
std::string arg; std::string arg;
@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
return true; return true;
} }
std::string basename(const std::string &path) { static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\"); size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) { if (pos == std::string::npos) {
return path; return path;

View File

@ -13,14 +13,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T> template <typename T>
static std::string to_string(const T & val) { static std::string to_string(const T & val) {
std::stringstream ss; std::stringstream ss;
ss << val; ss << val;
return ss.str(); return ss.str();
} }
bool gguf_ex_write(const std::string & fname) { static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty(); struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12); gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
} }
// just read tensor info // just read tensor info
bool gguf_ex_read_0(const std::string & fname) { static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = { struct gguf_init_params params = {
/*.no_alloc = */ false, /*.no_alloc = */ false,
/*.ctx = */ NULL, /*.ctx = */ NULL,
@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
} }
// read and create ggml_context containing the tensors and their data // read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) { static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {

View File

@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens; static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
void write_logfile(
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output, const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens const std::vector<llama_token> & output_tokens
@ -86,7 +87,7 @@ void write_logfile(
} }
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) { static void sigint_handler(int signo) {
if (signo == SIGINT) { if (signo == SIGINT) {
if (!is_interacting) { if (!is_interacting) {
is_interacting = true; is_interacting = true;

View File

@ -28,9 +28,10 @@ struct results_log_softmax {
float prob; float prob;
}; };
void write_logfile(const llama_context * ctx, const gpt_params & params, static void write_logfile(
const llama_model * model, const struct results_perplexity & results) { const llama_context * ctx, const gpt_params & params, const llama_model * model,
const struct results_perplexity & results
) {
if (params.logdir.empty()) { if (params.logdir.empty()) {
return; return;
} }
@ -76,7 +77,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
fclose(logfile); fclose(logfile);
} }
std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v); for (float v : logits) max_logit = std::max(max_logit, v);
@ -92,7 +93,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0]; float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0; double sum_exp = 0.0;
@ -100,9 +101,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, static void process_logits(
double & nll, double & nll2, float * logit_history, float * prob_history) { int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -130,7 +132,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
} }
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
@ -260,8 +262,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
return {tokens, std::exp(nll / count), logit_history, prob_history}; return {tokens, std::exp(nll / count), logit_history, prob_history};
} }
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params); return perplexity_v2(ctx, params);
} }
@ -400,8 +401,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
return {tokens, ppl, logit_history, prob_history}; return {tokens, ppl, logit_history, prob_history};
} }
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, static std::vector<float> hellaswag_evaluate_tokens(
int n_vocab, int n_thread) { llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
) {
std::vector<float> result; std::vector<float> result;
result.reserve(tokens.size() * n_vocab); result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch; size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@ -421,7 +423,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
return result; return result;
} }
void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt // Calculates hellaswag score (acc_norm) from prompt
// //
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl

View File

@ -34,8 +34,8 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types; std::vector<enum ggml_type> include_types;
}; };
const size_t HISTOGRAM_BUCKETS = 150; constexpr size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03; constexpr double HISTOGRAM_RANGE = 0.03;
struct error_stats { struct error_stats {
size_t num_samples; size_t num_samples;
@ -44,8 +44,7 @@ struct error_stats {
uint64_t error_histogram[HISTOGRAM_BUCKETS]; uint64_t error_histogram[HISTOGRAM_BUCKETS];
}; };
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params; quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
} }
// Check if a layer is included/excluded by command line // Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params & params, const std::string & layer) { static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) { for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) { if (std::regex_search(layer, std::regex(excluded))) {
return false; return false;
@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
} }
// Update error statistics given vectors with the before/after result of quantization // Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) { for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i]; double diff = input[i] - output[i];
stats.total_error += diff * diff; stats.total_error += diff * diff;
@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements; stats.num_samples += nelements;
} }
void combine_error_stats(error_stats & into, const error_stats & from) { static void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples; into.num_samples += from.num_samples;
into.total_error += from.total_error; into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error; if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i]; for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
} }
double find_quantile(const error_stats & stats, double quantile) { static double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0); double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0; double accum = 0;
@ -116,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY; return INFINITY;
} }
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples); double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5); double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95); double pct95 = find_quantile(stats, .95);
@ -143,17 +142,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
void test_roundtrip_on_chunk( static void test_roundtrip_on_chunk(
const ggml_tensor * layer, const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
int64_t offset, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
int64_t chunk_size, ) {
const ggml_type_traits_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
float * output_scratch,
error_stats & stats) {
if (layer->type == GGML_TYPE_F16) { if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) { for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset); input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@ -174,18 +166,11 @@ void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats // Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer( static void test_roundtrip_on_layer(
std::string & name, std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
bool print_layer_stats, const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
const ggml_type_traits_t & qfns, std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
bool use_reference, ) {
const ggml_tensor * layer,
std::vector<float> & input_scratch,
std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch,
error_stats & total_error,
int max_thread = 0) {
assert(tensor_is_contiguous(layer)); assert(tensor_is_contiguous(layer));
error_stats layer_error {}; error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer); uint64_t nelements = ggml_nelements(layer);

View File

@ -40,7 +40,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
}; };
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str; std::string ftype_str;
for (auto ch : ftype_str_in) { for (auto ch : ftype_str_in) {
@ -72,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");

View File

@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
return res; return res;
} }
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs) static json format_partial_response(
{ llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
) {
json res = json{ json res = json{
{"content", content}, {"content", content},
{"stop", false}, {"stop", false},
@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
}); });
} }
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) { static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
} }
@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met. // This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) { static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data); auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed. // Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@ -1258,7 +1259,8 @@ struct token_translator {
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); } std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
}; };
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) { static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
auto & gtps = llama.generated_token_probs; auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx}; auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };

View File

@ -1,3 +1,4 @@
#define LLAMA_API_INTERNAL
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
@ -108,7 +109,7 @@ static size_t utf8_len(char src) {
return lookup[highbits]; return lookup[highbits];
} }
void replace_all(std::string & s, const std::string & search, const std::string & replace) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result; std::string result;
for (size_t pos = 0; ; pos += search.length()) { for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos); auto new_pos = s.find(search, pos);
@ -1589,7 +1590,7 @@ struct llama_model_loader {
// load LLaMA models // load LLaMA models
// //
std::string llama_model_ftype_name(enum llama_ftype ftype) { static std::string llama_model_ftype_name(enum llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) { if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
} }
@ -4295,7 +4296,7 @@ struct llama_grammar_candidate {
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const char * src, const char * src,
llama_partial_utf8 partial_start) { llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@ -5893,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
// TODO: after the GGUF PR, this likely won't work and needs to be updated // TODO: after the GGUF PR, this likely won't work and needs to be updated
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
) {
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
const int64_t t_start_lora_us = ggml_time_us(); const int64_t t_start_lora_us = ggml_time_us();
@ -6440,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
return ctx; return ctx;
} }
struct llama_context * llama_init_from_file( static struct llama_context * llama_init_from_file(
const char * path_model, const char * path_model,
struct llama_context_params params) { struct llama_context_params params) {
struct llama_model * model = llama_load_model_from_file(path_model, params); struct llama_model * model = llama_load_model_from_file(path_model, params);
@ -6645,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
* llama_copy_state_data(ctx, &data_ctx); * llama_copy_state_data(ctx, &data_ctx);
* *
*/ */
void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
// copy rng // copy rng
{ {
std::stringstream rng_ss; std::stringstream rng_ss;
@ -7183,7 +7186,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
} }
// For internal test use // For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) { const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
) {
return ctx->model.tensors_by_name; return ctx->model.tensors_by_name;
} }

View File

@ -540,7 +540,9 @@ extern "C" {
struct ggml_tensor; struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL

View File

@ -16,7 +16,7 @@
constexpr int kVecSize = 1 << 18; constexpr int kVecSize = 1 << 18;
float drawFromGaussianPdf(std::mt19937& rndm) { static float drawFromGaussianPdf(std::mt19937& rndm) {
constexpr double kScale = 1./(1. + std::mt19937::max()); constexpr double kScale = 1./(1. + std::mt19937::max());
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
static float lastX; static float lastX;
@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) {
haveX = true; haveX = true;
return r*cos(phi); return r*cos(phi);
} }
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
} }

View File

@ -36,15 +36,15 @@
#define GGML_PRINT(...) printf(__VA_ARGS__) #define GGML_PRINT(...) printf(__VA_ARGS__)
float frand(void) { static float frand(void) {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
int irand(int n) { static int irand(int n) {
return rand()%n; return rand()%n;
} }
void get_random_dims(int64_t * dims, int ndims) { static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1; dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) { for (int i = 0; i < ndims; i++) {
@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) {
} }
} }
void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
dims[0] = dims[1] = dims[2] = dims[3] = 1; dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) { for (int i = 0; i < ndims; i++) {
@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
} }
struct ggml_tensor * get_random_tensor( static struct ggml_tensor * get_random_tensor(
struct ggml_context * ctx0, struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
int ndims, ) {
int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
switch (ndims) { switch (ndims) {
@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor(
return result; return result;
} }
float get_element(const struct ggml_tensor * t, int idx) { static float get_element(const struct ggml_tensor * t, int idx) {
return ((float *)t->data)[idx]; return ((float *)t->data)[idx];
} }
void set_element(struct ggml_tensor * t, int idx, float value) { static void set_element(struct ggml_tensor * t, int idx, float value) {
((float *)t->data)[idx] = value; ((float *)t->data)[idx] = value;
} }

View File

@ -13,24 +13,24 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
const float MAX_DOT_PRODUCT_ERROR = 0.02f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
const char* RESULT_STR[] = {"ok", "FAILED"}; static const char* RESULT_STR[] = {"ok", "FAILED"};
// Generate synthetic data // Generate synthetic data
void generate_data(float offset, size_t n, float * dst) { static void generate_data(float offset, size_t n, float * dst) {
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
dst[i] = 0.1 + 2*cosf(i + offset); dst[i] = 0.1 + 2*cosf(i + offset);
} }
} }
// Calculate RMSE between two float arrays // Calculate RMSE between two float arrays
float array_rmse(const float * a1, const float * a2, size_t n) { static float array_rmse(const float * a1, const float * a2, size_t n) {
double sum = 0; double sum = 0;
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
double diff = a1[i] - a2[i]; double diff = a1[i] - a2[i];
@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
} }
// Total quantization error on test data // Total quantization error on test data
float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size); std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size); std::vector<float> tmp_out(test_size);
@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons
} }
// Total quantization error on test data // Total quantization error on test data
float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size); std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size); std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size); std::vector<float> tmp_out_ref(test_size);
@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size,
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
} }
float dot_product(const float * a1, const float * a2, size_t test_size) { static float dot_product(const float * a1, const float * a2, size_t test_size) {
double sum = 0; double sum = 0;
for (size_t i = 0; i < test_size; i++) { for (size_t i = 0; i < test_size; i++) {
sum += a1[i] * a2[i]; sum += a1[i] * a2[i];
@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
} }
// Total dot product error // Total dot product error
float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { static float dot_product_error(
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size); std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size); std::vector<uint8_t> tmp_q2(2*test_size);

View File

@ -61,22 +61,22 @@ inline int64_t cpu_cycles() {
// Generate synthetic data // Generate synthetic data
void generate_data(float offset, size_t n, float * dst) { static void generate_data(float offset, size_t n, float * dst) {
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
dst[i] = 0.1 + 2*cosf(i + offset); dst[i] = 0.1 + 2*cosf(i + offset);
} }
} }
float gigabytes_per_second(size_t bytes, int64_t usecs) { static float gigabytes_per_second(size_t bytes, int64_t usecs) {
return bytes / (float) usecs * 1000000 / (1024*1024*1024); return bytes / (float) usecs * 1000000 / (1024*1024*1024);
} }
void * align_with_offset(void * ptr, int offset) { static void * align_with_offset(void * ptr, int offset) {
size_t dummy_size = MAX_ALIGNMENT * 4; size_t dummy_size = MAX_ALIGNMENT * 4;
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
} }
void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) { static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
int64_t min_time_us = INT64_MAX; int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0; int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX; int64_t min_time_cycles = INT64_MAX;
@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
} }
void usage(char * argv[]) { static void usage(char * argv[]) {
printf("Benchmark quantization specific functions on synthetic data\n"); printf("Benchmark quantization specific functions on synthetic data\n");
printf("\n"); printf("\n");
printf("usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);

View File

@ -12,7 +12,8 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
void dump(const llama_token_data_array * candidates) {
static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) { for (size_t i = 0; i < candidates->size; i++) {
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
} }
@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) {
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
void test_top_k(const std::vector<float> & probs, static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
const std::vector<float> & expected_probs,
int k) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -45,10 +44,7 @@ void test_top_k(const std::vector<float> & probs,
} }
void test_top_p(const std::vector<float> & probs, static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const std::vector<float> & expected_probs,
float p) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -70,9 +66,7 @@ void test_top_p(const std::vector<float> & probs,
} }
void test_tfs(const std::vector<float> & probs, static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
const std::vector<float> & expected_probs,
float z) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -93,9 +87,7 @@ void test_tfs(const std::vector<float> & probs,
} }
void test_typical(const std::vector<float> & probs, static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const std::vector<float> & expected_probs,
float p) {
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -116,11 +108,10 @@ void test_typical(const std::vector<float> & probs,
} }
void test_repetition_penalty( static void test_repetition_penalty(
const std::vector<float> & probs, const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<llama_token> & last_tokens, const std::vector<float> & expected_probs, float penalty
const std::vector<float> & expected_probs, ) {
float penalty) {
assert(probs.size() == expected_probs.size()); assert(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();
@ -145,11 +136,10 @@ void test_repetition_penalty(
} }
void test_frequency_presence_penalty( static void test_frequency_presence_penalty(
const std::vector<float> & probs, const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<llama_token> & last_tokens, const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
const std::vector<float> & expected_probs, ) {
float alpha_frequency, float alpha_presence) {
assert(probs.size() == expected_probs.size()); assert(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); size_t n_vocab = probs.size();

View File

@ -13,7 +13,7 @@
typedef int codepoint; typedef int codepoint;
std::string codepoint_to_utf8(codepoint cp) { static std::string codepoint_to_utf8(codepoint cp) {
std::string result; std::string result;
if (0x00 <= cp && cp <= 0x7f) { if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp); result.push_back(cp);