diff --git a/include/llama.h b/include/llama.h index f23355a6b..66c266298 100644 --- a/include/llama.h +++ b/include/llama.h @@ -345,7 +345,7 @@ extern "C" { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // itoken embeddings tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored