2023-04-11 13:19:54 +00:00
// Defines fileno on msys:
# ifndef _GNU_SOURCE
# define _GNU_SOURCE
2023-04-17 15:28:55 +00:00
# include <cstdint>
# include <cstdio>
2023-04-11 13:19:54 +00:00
# endif
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
# include "llama_util.h"
2023-03-22 05:32:36 +00:00
# include "llama.h"
# include "ggml.h"
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
# include <array>
2023-04-16 09:13:42 +00:00
# include <ctime>
2023-03-22 05:32:36 +00:00
# include <cinttypes>
# include <fstream>
# include <random>
2023-03-24 21:17:37 +00:00
# include <map>
2023-03-22 05:32:36 +00:00
# include <unordered_map>
# include <queue>
# include <cassert>
2023-03-22 08:55:45 +00:00
# include <cstring>
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
# include <climits>
# include <memory>
# include <algorithm>
# include <initializer_list>
2023-04-20 17:42:27 +00:00
# include <thread>
# include <atomic>
# include <mutex>
2023-04-22 06:21:32 +00:00
# include <sstream>
2023-03-29 20:51:37 +00:00
2023-03-24 21:17:37 +00:00
# define LLAMA_USE_SCRATCH
# define LLAMA_MAX_SCRATCH_BUFFERS 16
2023-03-22 05:32:36 +00:00
2023-03-24 21:17:37 +00:00
// available llama models
// Used as the key of the per-model memory requirement tables in this file.
enum e_model {
    MODEL_UNKNOWN = 0, // initial value of llama_model::type
    MODEL_7B      = 1,
    MODEL_13B     = 2,
    MODEL_30B     = 3,
    MODEL_65B     = 4,
};
// Number of bytes in one mebibyte (1024 * 1024); unit for the size tables below.
static const size_t MB = (size_t) 1 << 20;
// computed for n_ctx == 2048
// TODO: dynamically determine these sizes
// needs modifications in ggml
2023-04-17 14:41:53 +00:00
static const std : : map < e_model , size_t > & MEM_REQ_SCRATCH0 ( )
{
static std : : map < e_model , size_t > _MEM_REQ_SCRATCH0 = {
{ MODEL_7B , 512ull * MB } ,
{ MODEL_13B , 512ull * MB } ,
{ MODEL_30B , 512ull * MB } ,
2023-04-24 15:47:03 +00:00
{ MODEL_65B , 1024ull * MB } ,
2023-04-17 14:41:53 +00:00
} ;
return _MEM_REQ_SCRATCH0 ;
}
2023-03-24 21:17:37 +00:00
2023-04-17 14:41:53 +00:00
static const std : : map < e_model , size_t > & MEM_REQ_SCRATCH1 ( )
{
static std : : map < e_model , size_t > _MEM_REQ_SCRATCH1 = {
{ MODEL_7B , 512ull * MB } ,
{ MODEL_13B , 512ull * MB } ,
{ MODEL_30B , 512ull * MB } ,
2023-04-24 15:47:03 +00:00
{ MODEL_65B , 1024ull * MB } ,
2023-04-17 14:41:53 +00:00
} ;
return _MEM_REQ_SCRATCH1 ;
2023-04-22 08:08:12 +00:00
}
2023-03-24 21:17:37 +00:00
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
2023-04-17 14:41:53 +00:00
static const std : : map < e_model , size_t > & MEM_REQ_KV_SELF ( )
{
static std : : map < e_model , size_t > _MEM_REQ_KV_SELF = {
{ MODEL_7B , 1026ull * MB } ,
{ MODEL_13B , 1608ull * MB } ,
{ MODEL_30B , 3124ull * MB } ,
{ MODEL_65B , 5120ull * MB } ,
} ;
return _MEM_REQ_KV_SELF ;
2023-04-22 08:08:12 +00:00
}
2023-03-24 21:17:37 +00:00
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
2023-04-17 14:41:53 +00:00
static const std : : map < e_model , size_t > & MEM_REQ_EVAL ( )
{
static std : : map < e_model , size_t > _MEM_REQ_EVAL = {
{ MODEL_7B , 768ull * MB } ,
{ MODEL_13B , 1024ull * MB } ,
{ MODEL_30B , 1280ull * MB } ,
{ MODEL_65B , 1536ull * MB } ,
} ;
return _MEM_REQ_EVAL ;
2023-04-22 08:08:12 +00:00
}
2023-03-24 21:17:37 +00:00
2023-03-22 05:32:36 +00:00
// default hparams (LLaMA 7B)
struct llama_hparams {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
uint32_t n_vocab = 32000 ;
uint32_t n_ctx = 512 ; // this is provided as user input?
uint32_t n_embd = 4096 ;
uint32_t n_mult = 256 ;
uint32_t n_head = 32 ;
uint32_t n_layer = 32 ;
uint32_t n_rot = 64 ;
2023-04-11 15:03:51 +00:00
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
bool operator ! = ( const llama_hparams & other ) const {
return memcmp ( this , & other , sizeof ( llama_hparams ) ) ;
}
2023-03-22 05:32:36 +00:00
} ;
// Tensor handles belonging to one layer of the model.
// The struct only stores pointers; nothing here allocates or frees them.
struct llama_layer {
    // normalization (applied around the attention block)
    struct ggml_tensor * attention_norm;

    // attention weights
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization (applied around the feed-forward block)
    struct ggml_tensor * ffn_norm;

    // feed-forward weights
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
2023-03-24 21:17:37 +00:00
struct llama_kv_cache {
struct ggml_tensor * k ;
struct ggml_tensor * v ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
struct ggml_context * ctx = NULL ;
2023-03-24 21:17:37 +00:00
2023-04-29 00:04:18 +00:00
llama_ctx_buffer buf ;
2023-03-24 21:17:37 +00:00
int n ; // number of tokens currently in the cache
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
~ llama_kv_cache ( ) {
if ( ctx ) {
ggml_free ( ctx ) ;
}
}
2023-03-24 21:17:37 +00:00
} ;
2023-03-22 05:32:36 +00:00
struct llama_model {
2023-03-24 21:17:37 +00:00
e_model type = MODEL_UNKNOWN ;
2023-03-22 05:32:36 +00:00
llama_hparams hparams ;
struct ggml_tensor * tok_embeddings ;
struct ggml_tensor * norm ;
struct ggml_tensor * output ;
std : : vector < llama_layer > layers ;
2023-03-24 21:17:37 +00:00
// context
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
struct ggml_context * ctx = NULL ;
2023-03-24 21:17:37 +00:00
// key + value cache for the self attention
// TODO: move to llama_state
struct llama_kv_cache kv_self ;
// the model memory buffer
2023-04-29 00:04:18 +00:00
llama_ctx_buffer buf ;
2023-03-24 21:17:37 +00:00
2023-03-29 06:31:26 +00:00
// model memory mapped file
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
std : : unique_ptr < llama_mmap > mapping ;
2023-03-29 06:31:26 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// objects representing data potentially being locked in memory
llama_mlock mlock_buf ;
llama_mlock mlock_mmap ;
// for quantize-stats only
std : : vector < std : : pair < std : : string , struct ggml_tensor * > > tensors_by_name ;
~ llama_model ( ) {
if ( ctx ) {
ggml_free ( ctx ) ;
}
}
2023-03-22 05:32:36 +00:00
} ;
// Vocabulary: a lookup from token text to id, plus a vector of
// (text, score) entries named id_to_token.
struct llama_vocab {
    typedef int32_t     id;
    typedef std::string token;

    // one vocabulary entry: the token text and its score
    struct token_score {
        token tok;
        float score;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_score>      id_to_token;
};
struct llama_context {
std : : mt19937 rng ;
int64_t t_load_us = 0 ;
int64_t t_start_us = 0 ;
2023-03-29 20:51:37 +00:00
bool has_evaluated_once = false ;
2023-03-22 05:32:36 +00:00
int64_t t_sample_us = 0 ;
int64_t t_eval_us = 0 ;
2023-03-25 14:34:23 +00:00
int64_t t_p_eval_us = 0 ;
2023-03-22 05:32:36 +00:00
int32_t n_sample = 0 ; // number of tokens sampled
int32_t n_eval = 0 ; // number of eval calls
2023-03-25 14:34:23 +00:00
int32_t n_p_eval = 0 ; // number of tokens in eval calls for the prompt (with batch size > 1)
2023-03-22 05:32:36 +00:00
llama_model model ;
llama_vocab vocab ;
size_t mem_per_token = 0 ;
// decode output (2-dimensional array: [n_tokens][n_vocab])
std : : vector < float > logits ;
bool logits_all = false ;
2023-03-24 15:05:13 +00:00
// input embedding (1-dimensional array: [n_embd])
std : : vector < float > embedding ;
2023-03-24 21:17:37 +00:00
// memory buffers used to evaluate the model
// TODO: move in llama_state
2023-04-29 00:04:18 +00:00
llama_ctx_buffer buf_compute ;
llama_ctx_buffer buf_scratch [ LLAMA_MAX_SCRATCH_BUFFERS ] ;
2023-03-24 21:17:37 +00:00
int buf_last = 0 ;
size_t buf_max_size [ LLAMA_MAX_SCRATCH_BUFFERS ] = { 0 } ;
void use_buf ( struct ggml_context * ctx , int i ) {
# if defined(LLAMA_USE_SCRATCH)
size_t last_size = 0 ;
if ( i = = - 1 ) {
last_size = ggml_set_scratch ( ctx , { 0 , 0 , nullptr , } ) ;
} else {
auto & buf = buf_scratch [ i ] ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
last_size = ggml_set_scratch ( ctx , { 0 , buf . size , buf . addr , } ) ;
2023-03-24 21:17:37 +00:00
}
if ( buf_last > = 0 ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
buf_max_size [ buf_last ] = std : : max ( buf_max_size [ buf_last ] , last_size ) ;
2023-03-24 21:17:37 +00:00
}
buf_last = i ;
# else
( void ) i ;
( void ) ctx ;
# endif
}
size_t get_buf_max_mem ( int i ) const {
# if defined(LLAMA_USE_SCRATCH)
return buf_max_size [ i ] ;
# else
( void ) i ;
return 0 ;
# endif
}
2023-03-22 05:32:36 +00:00
} ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// Multiply `a` by `b` in type T and return the product.
// Throws the message built by format() when dividing the product by `a`
// does not give back `b`, i.e. the multiplication wrapped around.
template <typename T>
static T checked_mul(T a, T b) {
    const T product = a * b;
    const bool wrapped = (a != 0) && (product / a != b);
    if (wrapped) {
        throw format("overflow multiplying %llu * %llu",
                     (unsigned long long) a, (unsigned long long) b);
    }
    return product;
}
static size_t checked_div ( size_t a , size_t b ) {
if ( b = = 0 | | a % b ! = 0 ) {
throw format ( " error dividing %zu / %zu " , a , b ) ;
}
return a / b ;
}
// Render a tensor shape as text: every dimension is printed with "%5u" and
// consecutive dimensions are joined by " x ".
// `ne` must hold at least one dimension (ne.at(0) throws otherwise).
// The text is built in a 256-byte buffer; anything longer is truncated.
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    char text[256];
    snprintf(text, sizeof(text), "%5u", ne.at(0));

    const size_t n_dims = ne.size();
    for (size_t dim = 1; dim < n_dims; ++dim) {
        // append after what has been written so far
        const size_t used = strlen(text);
        snprintf(text + used, sizeof(text) - used, " x %5u", ne.at(dim));
    }

    return std::string(text);
}
static size_t llama_calc_tensor_size ( const std : : vector < uint32_t > & ne , enum ggml_type type ) {
size_t size = ggml_type_size ( type ) ;
for ( uint32_t dim : ne ) {
size = checked_mul < size_t > ( size , dim ) ;
}
return size / ggml_blck_size ( type ) ;
}
// One file's piece of a tensor, filled in while reading that file's tensor metadata.
struct llama_load_tensor_shard {
// extent of each dimension of this shard
std : : vector < uint32_t > ne ;
// size of the shard's data, as computed by calc_size()
size_t size ;
enum ggml_type type ;
// index of the file this shard was read from
size_t file_idx ;
// position in that file at which the shard's data starts
size_t file_off ;
// Set `size` from `ne` and `type` via llama_calc_tensor_size.
void calc_size ( ) {
size = llama_calc_tensor_size ( ne , type ) ;
}
} ;
// How the shards of a tensor combine into the full tensor
// (selected by llama_load_tensor::calc_split_type).
enum llama_split_type {
SPLIT_NONE , // shape is taken from the first shard unchanged
SPLIT_BY_COLUMNS , // ne[0] is multiplied by the number of shards
SPLIT_BY_ROWS // ne[1] is multiplied by the number of shards
} ;
struct llama_load_tensor {
std : : vector < llama_load_tensor_shard > shards ;
std : : string name ;
enum ggml_type type = GGML_TYPE_F32 ;
llama_split_type split_type = SPLIT_NONE ;
std : : vector < uint32_t > ne ;
size_t size ;
struct ggml_tensor * ggml_tensor = NULL ;
uint8_t * data ;
llama_load_tensor ( const std : : string & name ) : name ( name ) { }
void calc_all ( ) {
calc_type ( ) ;
calc_split_type ( ) ;
calc_ne ( ) ;
calc_size ( ) ;
}
void calc_type ( ) {
const auto & first_shard = shards . at ( 0 ) ;
for ( const auto & shard : shards ) {
if ( shard . type ! = first_shard . type ) {
throw format ( " inconsistent tensor shard type in '%s' " , name . c_str ( ) ) ;
}
}
type = first_shard . type ;
}
void calc_split_type ( ) {
if ( shards . at ( 0 ) . ne . size ( ) = = 1 | | // 1D tensors are just duplicated in every file
shards . size ( ) = = 1 ) { // only one file?
split_type = SPLIT_NONE ;
} else if ( name . find ( " tok_embeddings. " ) = = 0 | |
name . find ( " .attention.wo.weight " ) ! = std : : string : : npos | |
name . find ( " .feed_forward.w2.weight " ) ! = std : : string : : npos ) {
split_type = SPLIT_BY_COLUMNS ;
} else {
split_type = SPLIT_BY_ROWS ;
}
}
void calc_ne ( ) {
const auto & first_shard = shards . at ( 0 ) ;
for ( const auto & shard : shards ) {
if ( shard . ne ! = first_shard . ne ) {
throw format ( " inconsistent tensor shard shape in '%s': first was %s, other was %s " ,
name . c_str ( ) , llama_format_tensor_shape ( first_shard . ne ) . c_str ( ) , llama_format_tensor_shape ( shard . ne ) . c_str ( ) ) ;
}
}
ne = first_shard . ne ;
LLAMA_ASSERT ( shards . size ( ) < = UINT32_MAX ) ;
uint32_t n_shards = ( uint32_t ) shards . size ( ) ;
switch ( split_type ) {
2023-04-08 20:08:21 +00:00
case SPLIT_NONE :
ne = first_shard . ne ;
break ;
case SPLIT_BY_COLUMNS :
ne = { checked_mul < uint32_t > ( first_shard . ne [ 0 ] , n_shards ) ,
first_shard . ne [ 1 ] } ;
break ;
case SPLIT_BY_ROWS :
ne = { first_shard . ne [ 0 ] ,
checked_mul < uint32_t > ( first_shard . ne [ 1 ] , n_shards ) } ;
break ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
}
}
void calc_size ( ) {
size = llama_calc_tensor_size ( ne , type ) ;
}
} ;
// Collection of tensors gathered while reading model files, with lookup by name.
struct llama_load_tensors_map {
// tensors is kept in a separate vector to preserve file order
std : : vector < llama_load_tensor > tensors ;
// tensor name -> index of that tensor in `tensors`
std : : unordered_map < std : : string , size_t > name_to_idx ;
} ;
// File format variant, determined from the (magic, version) pair at the start of a file.
enum llama_file_version {
LLAMA_FILE_VERSION_GGML , // magic 'ggml'; no version field is read for it
LLAMA_FILE_VERSION_GGMF_V1 , // added version field and scores in vocab
LLAMA_FILE_VERSION_GGJT_V1 , // added padding
} ;
struct llama_file_loader {
llama_file file ;
llama_file_version file_version ;
llama_hparams hparams ;
llama_vocab vocab ;
llama_file_loader ( const char * fname , size_t file_idx , llama_load_tensors_map & tensors_map )
: file ( fname , " rb " ) {
fprintf ( stderr , " llama.cpp: loading model from %s \n " , fname ) ;
read_magic ( ) ;
read_hparams ( ) ;
read_vocab ( ) ;
read_tensor_metadata ( file_idx , tensors_map ) ;
}
void read_magic ( ) {
uint32_t magic = file . read_u32 ( ) ;
uint32_t version = 0 ;
if ( magic ! = ' ggml ' ) {
version = file . read_u32 ( ) ;
}
if ( magic = = ' ggml ' & & version = = 0 ) {
file_version = LLAMA_FILE_VERSION_GGML ;
} else if ( magic = = ' ggmf ' & & version = = 1 ) {
file_version = LLAMA_FILE_VERSION_GGMF_V1 ;
} else if ( magic = = ' ggjt ' & & version = = 1 ) {
file_version = LLAMA_FILE_VERSION_GGJT_V1 ;
} else {
throw format ( " unknown (magic, version) combination : % 08 x , % 08 x ; is this really a GGML file ? " ,
magic , version ) ;
}
}
void read_hparams ( ) {
hparams . n_vocab = file . read_u32 ( ) ;
hparams . n_embd = file . read_u32 ( ) ;
hparams . n_mult = file . read_u32 ( ) ;
hparams . n_head = file . read_u32 ( ) ;
hparams . n_layer = file . read_u32 ( ) ;
hparams . n_rot = file . read_u32 ( ) ;
2023-04-11 15:03:51 +00:00
hparams . ftype = ( enum llama_ftype ) file . read_u32 ( ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
}
void read_vocab ( ) {
vocab . id_to_token . resize ( hparams . n_vocab ) ;
for ( uint32_t i = 0 ; i < hparams . n_vocab ; i + + ) {
uint32_t len = file . read_u32 ( ) ;
std : : string word = file . read_string ( len ) ;
float score = 0.0f ;
if ( file_version > = LLAMA_FILE_VERSION_GGMF_V1 ) {
file . read_raw ( & score , sizeof ( score ) ) ;
}
vocab . token_to_id [ word ] = i ;
auto & tok_score = vocab . id_to_token [ i ] ;
tok_score . tok = std : : move ( word ) ;
tok_score . score = score ;
}
}
void read_tensor_metadata ( size_t file_idx , llama_load_tensors_map & tensors_map ) {
while ( file . tell ( ) < file . size ) {
llama_load_tensor_shard shard ;
uint32_t n_dims = file . read_u32 ( ) ;
uint32_t name_len = file . read_u32 ( ) ;
2023-04-11 15:03:51 +00:00
shard . type = ( enum ggml_type ) file . read_u32 ( ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
shard . ne . resize ( n_dims ) ;
file . read_raw ( shard . ne . data ( ) , sizeof ( shard . ne [ 0 ] ) * n_dims ) ;
std : : string name = file . read_string ( name_len ) ;
if ( n_dims < 1 | | n_dims > 2 ) {
throw format ( " llama.cpp: tensor '%s' should not be %u-dimensional " , name . c_str ( ) , n_dims ) ;
}
2023-04-11 15:03:51 +00:00
switch ( shard . type ) {
case GGML_TYPE_F32 :
case GGML_TYPE_F16 :
case GGML_TYPE_Q4_0 :
case GGML_TYPE_Q4_1 :
2023-04-18 20:54:57 +00:00
case GGML_TYPE_Q4_2 :
2023-04-26 20:14:13 +00:00
case GGML_TYPE_Q5_0 :
case GGML_TYPE_Q5_1 :
2023-04-25 20:40:51 +00:00
case GGML_TYPE_Q8_0 :
2023-04-11 15:03:51 +00:00
break ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
default : {
2023-04-11 15:03:51 +00:00
throw format ( " unrecognized tensor type %u \n " , shard . type ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
}
}
if ( file_version > = LLAMA_FILE_VERSION_GGJT_V1 ) {
// skip to the next multiple of 32 bytes
file . seek ( - file . tell ( ) & 31 , SEEK_CUR ) ;
}
shard . file_idx = file_idx ;
shard . file_off = file . tell ( ) ;
shard . calc_size ( ) ;
file . seek ( shard . size , SEEK_CUR ) ;
auto it = tensors_map . name_to_idx . find ( name ) ;
size_t idx ;
if ( it ! = tensors_map . name_to_idx . end ( ) ) {
idx = it - > second ;
} else {
tensors_map . tensors . emplace_back ( name ) ;
idx = tensors_map . tensors . size ( ) - 1 ;
tensors_map . name_to_idx . emplace ( name , idx ) ;
}
tensors_map . tensors . at ( idx ) . shards . push_back ( shard ) ;
}
}
} ;
struct llama_file_saver {
llama_file file ;
llama_file_loader * any_file_loader ;
2023-04-11 15:03:51 +00:00
llama_file_saver ( const char * fname , llama_file_loader * any_file_loader , enum llama_ftype new_ftype )
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
: file ( fname , " wb " ) , any_file_loader ( any_file_loader ) {
fprintf ( stderr , " llama.cpp: saving model to %s \n " , fname ) ;
write_magic ( ) ;
2023-04-11 15:03:51 +00:00
write_hparams ( new_ftype ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
write_vocab ( ) ;
}
// Writes the start of the file header: the format magic followed by the
// format version number.
void write_magic() {
    // The magic is the tag "ggjt". It is spelled as an explicit integer
    // because the value of a multi-character literal is implementation-defined;
    // 0x67676a74 is ('g' << 24 | 'g' << 16 | 'j' << 8 | 't'), the value GCC and
    // Clang assign to the four-character literal.
    file.write_u32(0x67676a74u); // magic
    file.write_u32(1);           // version
}
2023-04-11 15:03:51 +00:00
void write_hparams ( enum llama_ftype new_ftype ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
const llama_hparams & hparams = any_file_loader - > hparams ;
file . write_u32 ( hparams . n_vocab ) ;
file . write_u32 ( hparams . n_embd ) ;
file . write_u32 ( hparams . n_mult ) ;
file . write_u32 ( hparams . n_head ) ;
file . write_u32 ( hparams . n_layer ) ;
file . write_u32 ( hparams . n_rot ) ;
2023-04-11 15:03:51 +00:00
file . write_u32 ( new_ftype ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
}
// Writes the vocabulary section: for each of the n_vocab entries, the token
// length (u32), the raw token bytes, and the raw bytes of its score.
void write_vocab() {
    const bool input_is_ggml = any_file_loader->file_version == LLAMA_FILE_VERSION_GGML;
    if (input_is_ggml) {
        fprintf(stderr, " llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores \n ");
    }

    const auto & entries = any_file_loader->vocab.id_to_token;
    const uint32_t count = any_file_loader->hparams.n_vocab;
    uint32_t id = 0;
    while (id < count) {
        // .at() bounds-checks in case n_vocab exceeds the loaded vocabulary.
        const auto & entry = entries.at(id);
        const auto length = entry.tok.size();
        file.write_u32((uint32_t) length);
        file.write_raw(entry.tok.data(), length);
        file.write_raw(&entry.score, sizeof(entry.score));
        ++id;
    }
}
void write_tensor ( llama_load_tensor & tensor , enum ggml_type new_type , const void * new_data , size_t new_size ) {
switch ( new_type ) {
2023-04-11 15:03:51 +00:00
case GGML_TYPE_F32 :
case GGML_TYPE_F16 :
case GGML_TYPE_Q4_0 :
case GGML_TYPE_Q4_1 :
2023-04-18 20:54:57 +00:00
case GGML_TYPE_Q4_2 :
2023-04-26 20:14:13 +00:00
case GGML_TYPE_Q5_0 :
case GGML_TYPE_Q5_1 :
2023-04-25 20:40:51 +00:00
case GGML_TYPE_Q8_0 :
2023-04-11 15:03:51 +00:00
break ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
default : LLAMA_ASSERT ( false ) ;
}
file . write_u32 ( ( uint32_t ) tensor . ne . size ( ) ) ;
file . write_u32 ( ( uint32_t ) tensor . name . size ( ) ) ;
2023-04-11 15:03:51 +00:00
file . write_u32 ( new_type ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
file . write_raw ( tensor . ne . data ( ) , sizeof ( tensor . ne [ 0 ] ) * tensor . ne . size ( ) ) ;
file . write_raw ( tensor . name . data ( ) , tensor . name . size ( ) ) ;
file . seek ( - file . tell ( ) & 31 , SEEK_CUR ) ;
LLAMA_ASSERT ( new_size = = llama_calc_tensor_size ( tensor . ne , new_type ) ) ;
file . write_raw ( new_data , new_size ) ;
}
} ;
struct llama_model_loader {
std : : vector < std : : unique_ptr < llama_file_loader > > file_loaders ;
llama_load_tensors_map tensors_map ;
bool use_mmap ;
size_t num_ggml_tensors_created = 0 ;
struct ggml_context * ggml_ctx = NULL ;
std : : unique_ptr < llama_mmap > mapping ;
llama_model_loader ( const std : : string & fname_base , bool use_mmap , bool vocab_only ) {
auto first_file = new llama_file_loader ( fname_base . c_str ( ) , 0 , tensors_map ) ;
file_loaders . emplace_back ( first_file ) ;
uint32_t n_parts = vocab_only ? 1 : guess_n_parts ( ) ;
for ( uint32_t i = 1 ; i < n_parts ; i + + ) {
std : : string fname = fname_base + " . " + std : : to_string ( i ) ;
auto ith_file = new llama_file_loader ( fname . c_str ( ) , i , tensors_map ) ;
file_loaders . emplace_back ( ith_file ) ;
if ( ith_file - > hparams ! = first_file - > hparams ) {
throw format ( " llama.cpp: hparams inconsistent between files " ) ;
}
}
if ( ! llama_mmap : : SUPPORTED ) {
use_mmap = false ;
}
if ( use_mmap & & alignment_prevents_mmap ( ) ) {
fprintf ( stderr , " llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this \n " ) ;
use_mmap = false ;
}
this - > use_mmap = use_mmap ;
for ( llama_load_tensor & lt : tensors_map . tensors ) {
lt . calc_all ( ) ;
}
}
// Returns true if any shard of any tensor starts at a file offset that is not
// a multiple of 4 bytes, in which case the constructor disables mmap.
bool alignment_prevents_mmap() {
    for (const llama_load_tensor & tensor : tensors_map.tensors) {
        for (const llama_load_tensor_shard & piece : tensor.shards) {
            const bool misaligned = (piece.file_off % 4) != 0;
            if (misaligned) {
                return true;
            }
        }
    }
    return false;
}
// Infers how many files the model is split into by comparing the full
// embedding width (hparams.n_embd of the first file) with the first dimension
// of the first shard of "tok_embeddings.weight".
//
// Throws std::string if that tensor is absent, or if the shard's first
// dimension is zero (which would otherwise be an integer division by zero).
uint32_t guess_n_parts() const {
    // The key must match the tensor name exactly (no padding).
    auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
    if (it == tensors_map.name_to_idx.end()) {
        throw std::string(" missing tok_embeddings.weight ");
    }
    const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
    const auto shard_width = lt.shards.at(0).ne.at(0);
    if (shard_width == 0) {
        throw std::string("llama.cpp: tok_embeddings.weight has a zero-sized first dimension");
    }
    return file_loaders.at(0)->hparams.n_embd / shard_width;
}
void calc_sizes ( size_t * ctx_size_p , size_t * mmapped_size_p ) const {
* ctx_size_p = * mmapped_size_p = 0 ;
for ( const llama_load_tensor & lt : tensors_map . tensors ) {
* ctx_size_p + = sizeof ( struct ggml_tensor ) + GGML_OBJECT_SIZE ;
* ( use_mmap ? mmapped_size_p : ctx_size_p ) + = lt . size ;
}
}
struct ggml_tensor * get_tensor ( const std : : string & name , std : : vector < uint32_t > ne ) {
auto it = tensors_map . name_to_idx . find ( name ) ;
if ( it = = tensors_map . name_to_idx . end ( ) ) {
throw format ( " llama.cpp: tensor '%s' is missing from model " , name . c_str ( ) ) ;
}
llama_load_tensor & lt = tensors_map . tensors . at ( it - > second ) ;
if ( lt . ne ! = ne ) {
throw format ( " llama.cpp: tensor '%s' has wrong shape; expected %s, got %s " ,
name . c_str ( ) , llama_format_tensor_shape ( ne ) . c_str ( ) , llama_format_tensor_shape ( lt . ne ) . c_str ( ) ) ;
}
2023-04-17 15:28:55 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
return get_tensor_for ( lt ) ;
}
// Creates a 1-D or 2-D ggml tensor in `ggml_ctx` matching the type and shape
// of `lt`, records it in `lt.ggml_tensor`, and bumps the created-tensor count.
// Asserts that `lt` has one or two dimensions and has no ggml tensor yet.
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
    struct ggml_tensor * created;
    if (lt.ne.size() != 2) {
        LLAMA_ASSERT(lt.ne.size() == 1);
        created = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
    } else {
        created = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
    }

    // A failure here means get_tensor was called twice for the same tensor.
    LLAMA_ASSERT(lt.ggml_tensor == NULL);
    lt.ggml_tensor = created;
    ++num_ggml_tensors_created;
    return created;
}
void done_getting_tensors ( ) {
if ( num_ggml_tensors_created ! = tensors_map . tensors . size ( ) ) {
throw std : : string ( " llama.cpp: file contained more tensors than expected " ) ;
}
}
// Fills in the data of every tensor, either by mapping the first file or by
// reading from the file(s), and points each ggml tensor at its data.
//
// progress_callback: optional; called with a fraction before each tensor and
//                    with 1.0f at the end. Suppressed for mmap without mlock.
// progress_callback_user_data: passed through to the callback unchanged.
// lmlock: optional; with mmap, it is initialised at the mapping address and
//         grown after every tensor.
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
    // Total payload size, used as the denominator of the progress fraction.
    size_t data_size = 0;
    for (const llama_load_tensor & lt : tensors_map.tensors) {
        data_size += lt.size;
    }

    if (use_mmap) {
        mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
        if (lmlock) {
            lmlock->init(mapping->addr);
        } else {
            // Loading is lazy in this mode and cannot be measured, so
            // progress is not reported at all.
            progress_callback = NULL;
        }
    }
    const bool grow_lock = use_mmap && lmlock;

    size_t done_size = 0;
    for (llama_load_tensor & lt : tensors_map.tensors) {
        if (progress_callback) {
            progress_callback((float) done_size / data_size, progress_callback_user_data);
        }

        // Every tensor is expected to have a ggml tensor by now
        // (see done_getting_tensors).
        LLAMA_ASSERT(lt.ggml_tensor);

        // load_data_for either fills the existing buffer or repoints lt.data
        // into the mapping; copy the result back either way.
        lt.data = (uint8_t *) lt.ggml_tensor->data;
        load_data_for(lt);
        lt.ggml_tensor->data = lt.data;

        done_size += lt.size;
        if (grow_lock) {
            lmlock->grow_to(done_size);
        }
    }

    if (progress_callback) {
        progress_callback(1.0f, progress_callback_user_data);
    }
}
// Makes `lt.data` hold the tensor's bytes.
//   mmap:             point lt.data into the mapping (single shard required)
//   SPLIT_NONE:       read the single shard into lt.data
//   SPLIT_BY_ROWS:    read the shards back to back into lt.data
//   SPLIT_BY_COLUMNS: read each shard into a temporary buffer, then
//                     interleave the shards row by row into lt.data
void load_data_for(llama_load_tensor & lt) {
    if (use_mmap) {
        LLAMA_ASSERT(lt.shards.size() == 1);
        const llama_load_tensor_shard & only = lt.shards.at(0);
        lt.data = (uint8_t *) mapping->addr + only.file_off;
    } else if (lt.split_type == SPLIT_NONE) {
        const llama_load_tensor_shard & only = lt.shards.at(0);
        llama_file & file = file_loaders.at(only.file_idx)->file;
        file.seek(lt.shards.at(0).file_off, SEEK_SET);
        file.read_raw(lt.data, lt.size);
    } else if (lt.split_type == SPLIT_BY_ROWS) {
        size_t offset = 0;
        for (size_t i = 0; i < lt.shards.size(); i++) {
            llama_load_tensor_shard & shard = lt.shards.at(i);
            llama_file & file = file_loaders.at(shard.file_idx)->file;
            file.seek(shard.file_off, SEEK_SET);
            file.read_raw(lt.data + offset, shard.size);
            offset += shard.size;
        }
        LLAMA_ASSERT(offset == lt.size);
    } else if (lt.split_type == SPLIT_BY_COLUMNS) {
        // Read each shard in one go into its own temporary buffer, so the OS
        // performs large loads.
        std::vector<llama_buffer> tmp_bufs(lt.shards.size());
        size_t slot = 0;
        for (llama_load_tensor_shard & shard : lt.shards) {
            llama_buffer & staging = tmp_bufs.at(slot);
            llama_file & file = file_loaders.at(shard.file_idx)->file;
            file.seek(shard.file_off, SEEK_SET);
            staging.resize(shard.size);
            file.read_raw(staging.addr, shard.size);
            ++slot;
        }

        // Reshape: output row r is row r of shard 0, then row r of shard 1, ...
        const size_t num_rows = lt.ne.at(1);
        const size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
        size_t out_offset = 0;
        for (size_t row = 0; row < num_rows; row++) {
            const size_t src_offset = row * per_shard_row_size;
            for (llama_buffer & staging : tmp_bufs) {
                memcpy(lt.data + out_offset, staging.addr + src_offset, per_shard_row_size);
                out_offset += per_shard_row_size;
            }
        }
        LLAMA_ASSERT(out_offset == lt.size);
    }

    // Debug aid, disabled; flip the condition to print a checksum per tensor.
    if (0) {
        print_checksum(lt);
    }
}
// Debug helper: prints the tensor's name, an sdbm hash of its data bytes, its
// shape and its size to stderr.
static void print_checksum(llama_load_tensor & lt) {
    uint32_t sum = 0;
    const uint8_t * cursor = lt.data;
    const uint8_t * const stop = cursor + lt.size;
    for (; cursor != stop; ++cursor) {
        // sdbm step: hash = byte + hash * 65599
        sum = *cursor + (sum << 6) + (sum << 16) - sum;
    }
    fprintf(stderr, " %s checksum: %#08x (%s, size %zu) \n ", lt.name.c_str(), sum,
            llama_format_tensor_shape(lt.ne).c_str(), lt.size);
}
} ;
2023-03-24 21:17:37 +00:00
//
// kv cache
//
static bool kv_cache_init (
const struct llama_hparams & hparams ,
struct llama_kv_cache & cache ,
ggml_type wtype ,
int n_ctx ) {
const int n_embd = hparams . n_embd ;
const int n_layer = hparams . n_layer ;
2023-04-02 10:21:31 +00:00
const int64_t n_mem = ( int64_t ) n_layer * n_ctx ;
const int64_t n_elements = n_embd * n_mem ;
2023-03-24 21:17:37 +00:00
2023-03-25 04:38:14 +00:00
cache . buf . resize ( 2u * n_elements * ggml_type_size ( wtype ) + 2u * MB ) ;
2023-03-24 21:17:37 +00:00
struct ggml_init_params params ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
params . mem_size = cache . buf . size ;
params . mem_buffer = cache . buf . addr ;
2023-03-29 00:03:43 +00:00
params . no_alloc = false ;
2023-03-24 21:17:37 +00:00
cache . ctx = ggml_init ( params ) ;
if ( ! cache . ctx ) {
fprintf ( stderr , " %s: failed to allocate memory for kv cache \n " , __func__ ) ;
return false ;
}
cache . k = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
cache . v = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
return true ;
}
2023-03-22 05:32:36 +00:00
// Builds and returns a llama_context_params value filled with the library defaults.
// The initializer is positional: each value is labelled by the field-name comment
// next to it, so the order must stay in sync with the struct declaration
// (declared outside this section -- verify there before reordering or inserting).
struct llama_context_params llama_context_default_params ( ) {
struct llama_context_params result = {
2023-03-25 05:26:28 +00:00
/*.n_ctx =*/ 512 ,
/*.n_parts =*/ - 1 ,
/*.seed =*/ 0 ,
/*.f16_kv =*/ false ,
/*.logits_all =*/ false ,
/*.vocab_only =*/ false ,
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// mmap is the only option that defaults to enabled.
/*.use_mmap =*/ true ,
2023-03-25 05:26:28 +00:00
/*.use_mlock =*/ false ,
/*.embedding =*/ false ,
// No progress reporting unless the caller installs a callback.
/*.progress_callback =*/ nullptr ,
/*.progress_callback_user_data =*/ nullptr ,
2023-03-22 05:32:36 +00:00
} ;
return result ;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// Returns the value of llama_mmap::SUPPORTED.
// NOTE(review): presumably a per-platform constant saying whether memory-mapped
// loading is available; it is defined outside this section -- confirm there.
bool llama_mmap_supported ( ) {
return llama_mmap : : SUPPORTED ;
2023-03-29 06:31:26 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// Returns the value of llama_mlock::SUPPORTED.
// NOTE(review): presumably a per-platform constant saying whether memory locking
// is available; it is defined outside this section -- confirm there.
bool llama_mlock_supported ( ) {
return llama_mlock : : SUPPORTED ;
2023-03-29 00:03:43 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
//
// model loading
//
2023-03-29 20:51:37 +00:00
2023-04-08 20:08:21 +00:00
// Returns a human-readable description of a model file format version, for log output.
//
// version: one of the LLAMA_FILE_VERSION_* values.
// Returns a pointer to a string literal; the caller must not free or modify it.
//
// Any other value is a programming error and trips LLAMA_ASSERT.
static const char * llama_file_version_name(llama_file_version version) {
    switch (version) {
        case LLAMA_FILE_VERSION_GGML:
            return " 'ggml' (old version with low tokenizer quality and no mmap support) ";
        case LLAMA_FILE_VERSION_GGMF_V1:
            return " ggmf v1 (old version with no mmap support) ";
        case LLAMA_FILE_VERSION_GGJT_V1:
            return " ggjt v1 (latest) ";
        default:
            LLAMA_ASSERT(false);
    }
    // Reached only if LLAMA_ASSERT returns (e.g. it is compiled out). Without this,
    // control would flow off the end of a non-void function, which is undefined behaviour.
    return "unknown";
}
2023-04-11 15:03:51 +00:00
// Maps a llama_ftype value to a short descriptive label for log output.
// Returns a pointer to a string literal. Values without a dedicated case get
// the generic "unknown" label from the default arm instead of failing.
static const char * llama_ftype_name ( enum llama_ftype ftype ) {
switch ( ftype ) {
case LLAMA_FTYPE_ALL_F32 : return " all F32 " ;
case LLAMA_FTYPE_MOSTLY_F16 : return " mostly F16 " ;
case LLAMA_FTYPE_MOSTLY_Q4_0 : return " mostly Q4_0 " ;
case LLAMA_FTYPE_MOSTLY_Q4_1 : return " mostly Q4_1 " ;
2023-04-12 15:06:16 +00:00
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 :
return " mostly Q4_1, some F16 " ;
2023-04-18 20:54:57 +00:00
case LLAMA_FTYPE_MOSTLY_Q4_2 : return " mostly Q4_2 " ;
2023-04-26 20:14:13 +00:00
case LLAMA_FTYPE_MOSTLY_Q5_0 : return " mostly Q5_0 " ;
case LLAMA_FTYPE_MOSTLY_Q5_1 : return " mostly Q5_1 " ;
2023-04-25 20:40:51 +00:00
case LLAMA_FTYPE_MOSTLY_Q8_0 : return " mostly Q8_0 " ;
2023-04-12 15:06:16 +00:00
// Unrecognised file types are reported, not rejected.
default : return " unknown, may not work " ;
2023-04-11 15:03:51 +00:00
}
}
2023-04-08 20:08:21 +00:00
// Returns a short size label for a model type, for log output.
//
// type: an e_model value.
// Returns a pointer to a string literal; the caller must not free or modify it.
//
// A type without a dedicated label yields "unknown" rather than asserting. This
// function is only used for diagnostics, and the loader assigns a model type
// only for a fixed set of layer counts, so any other model would otherwise
// abort while printing its own description.
static const char * llama_model_type_name(e_model type) {
    switch (type) {
        case MODEL_7B:  return " 7B ";
        case MODEL_13B: return " 13B ";
        case MODEL_30B: return " 30B ";
        case MODEL_65B: return " 65B ";
        default:        return "unknown";
    }
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
static void llama_model_load_internal (
2023-03-22 05:32:36 +00:00
const std : : string & fname ,
llama_context & lctx ,
int n_ctx ,
ggml_type memory_type ,
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
bool use_mmap ,
bool use_mlock ,
2023-03-25 05:26:28 +00:00
bool vocab_only ,
llama_progress_callback progress_callback ,
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
void * progress_callback_user_data ) {
2023-03-22 05:32:36 +00:00
2023-03-29 20:51:37 +00:00
lctx . t_start_us = ggml_time_us ( ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
std : : unique_ptr < llama_model_loader > ml ( new llama_model_loader ( fname , use_mmap , vocab_only ) ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
lctx . vocab = std : : move ( ml - > file_loaders . at ( 0 ) - > vocab ) ;
auto & model = lctx . model ;
2023-04-08 20:08:21 +00:00
model . hparams = ml - > file_loaders . at ( 0 ) - > hparams ;
llama_file_version file_version = ml - > file_loaders . at ( 0 ) - > file_version ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
auto & hparams = model . hparams ;
uint32_t n_ff = ( ( 2 * ( 4 * hparams . n_embd ) / 3 + hparams . n_mult - 1 ) / hparams . n_mult ) * hparams . n_mult ;
2023-03-29 20:51:37 +00:00
2023-03-22 05:32:36 +00:00
{
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
switch ( hparams . n_layer ) {
case 32 : model . type = e_model : : MODEL_7B ; break ;
case 40 : model . type = e_model : : MODEL_13B ; break ;
case 60 : model . type = e_model : : MODEL_30B ; break ;
case 80 : model . type = e_model : : MODEL_65B ; break ;
2023-03-22 05:32:36 +00:00
}
hparams . n_ctx = n_ctx ;
2023-04-08 20:08:21 +00:00
}
2023-03-22 05:32:36 +00:00
2023-04-08 20:08:21 +00:00
{
fprintf ( stderr , " %s: format = %s \n " , __func__ , llama_file_version_name ( file_version ) ) ;
fprintf ( stderr , " %s: n_vocab = %u \n " , __func__ , hparams . n_vocab ) ;
fprintf ( stderr , " %s: n_ctx = %u \n " , __func__ , hparams . n_ctx ) ;
fprintf ( stderr , " %s: n_embd = %u \n " , __func__ , hparams . n_embd ) ;
fprintf ( stderr , " %s: n_mult = %u \n " , __func__ , hparams . n_mult ) ;
fprintf ( stderr , " %s: n_head = %u \n " , __func__ , hparams . n_head ) ;
fprintf ( stderr , " %s: n_layer = %u \n " , __func__ , hparams . n_layer ) ;
fprintf ( stderr , " %s: n_rot = %u \n " , __func__ , hparams . n_rot ) ;
2023-04-11 15:03:51 +00:00
fprintf ( stderr , " %s: ftype = %u (%s) \n " , __func__ , hparams . ftype , llama_ftype_name ( hparams . ftype ) ) ;
2023-04-08 20:08:21 +00:00
fprintf ( stderr , " %s: n_ff = %u \n " , __func__ , n_ff ) ;
fprintf ( stderr , " %s: n_parts = %zu \n " , __func__ , ml - > file_loaders . size ( ) ) ;
fprintf ( stderr , " %s: model size = %s \n " , __func__ , llama_model_type_name ( model . type ) ) ;
2023-03-22 05:32:36 +00:00
}
if ( vocab_only ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
return ;
2023-03-29 00:03:43 +00:00
}
2023-03-22 05:32:36 +00:00
auto & ctx = model . ctx ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
size_t ctx_size , mmapped_size ;
ml - > calc_sizes ( & ctx_size , & mmapped_size ) ;
fprintf ( stderr , " %s: ggml ctx size = %6.2f KB \n " , __func__ , ctx_size / 1024.0 ) ;
2023-03-22 05:32:36 +00:00
2023-03-24 21:17:37 +00:00
// print memory requirements
{
const size_t scale = memory_type = = GGML_TYPE_F32 ? 2 : 1 ;
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
mmapped_size +
2023-04-17 14:41:53 +00:00
MEM_REQ_SCRATCH0 ( ) . at ( model . type ) +
MEM_REQ_SCRATCH1 ( ) . at ( model . type ) +
MEM_REQ_EVAL ( ) . at ( model . type ) ;
2023-03-24 21:17:37 +00:00
// this is the memory required by one llama_state
const size_t mem_required_state =
2023-04-17 14:41:53 +00:00
scale * MEM_REQ_KV_SELF ( ) . at ( model . type ) ;
2023-03-24 21:17:37 +00:00
fprintf ( stderr , " %s: mem required = %7.2f MB (+ %7.2f MB per state) \n " , __func__ ,
mem_required / 1024.0 / 1024.0 , mem_required_state / 1024.0 / 1024.0 ) ;
}
2023-03-22 05:32:36 +00:00
// create the ggml context
{
2023-03-24 21:17:37 +00:00
lctx . model . buf . resize ( ctx_size ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
if ( use_mlock ) {
lctx . model . mlock_buf . init ( lctx . model . buf . addr ) ;
lctx . model . mlock_buf . grow_to ( lctx . model . buf . size ) ;
}
2023-03-24 21:17:37 +00:00
2023-03-22 05:32:36 +00:00
struct ggml_init_params params = {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
/*.mem_size =*/ lctx . model . buf . size ,
/*.mem_buffer =*/ lctx . model . buf . addr ,
/*.no_alloc =*/ ml - > use_mmap ,
2023-03-22 05:32:36 +00:00
} ;
model . ctx = ggml_init ( params ) ;
if ( ! model . ctx ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
throw format ( " ggml_init() failed " ) ;
2023-03-22 05:32:36 +00:00
}
}
// prepare memory for the weights
{
const auto & hparams = model . hparams ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
const uint32_t n_embd = hparams . n_embd ;
const uint32_t n_layer = hparams . n_layer ;
const uint32_t n_vocab = hparams . n_vocab ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
ml - > ggml_ctx = ctx ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
model . tok_embeddings = ml - > get_tensor ( " tok_embeddings.weight " , { n_embd , n_vocab } ) ;
2023-04-16 10:58:48 +00:00
model . norm = ml - > get_tensor ( " norm.weight " , { n_embd } ) ;
model . output = ml - > get_tensor ( " output.weight " , { n_embd , n_vocab } ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
model . layers . resize ( n_layer ) ;
for ( uint32_t i = 0 ; i < n_layer ; + + i ) {
2023-03-22 05:32:36 +00:00
auto & layer = model . layers [ i ] ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
std : : string layers_i = " layers. " + std : : to_string ( i ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
layer . attention_norm = ml - > get_tensor ( layers_i + " .attention_norm.weight " , { n_embd } ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
layer . wq = ml - > get_tensor ( layers_i + " .attention.wq.weight " , { n_embd , n_embd } ) ;
layer . wk = ml - > get_tensor ( layers_i + " .attention.wk.weight " , { n_embd , n_embd } ) ;
layer . wv = ml - > get_tensor ( layers_i + " .attention.wv.weight " , { n_embd , n_embd } ) ;
layer . wo = ml - > get_tensor ( layers_i + " .attention.wo.weight " , { n_embd , n_embd } ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
layer . ffn_norm = ml - > get_tensor ( layers_i + " .ffn_norm.weight " , { n_embd } ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
layer . w1 = ml - > get_tensor ( layers_i + " .feed_forward.w1.weight " , { n_embd , n_ff } ) ;
layer . w2 = ml - > get_tensor ( layers_i + " .feed_forward.w2.weight " , { n_ff , n_embd } ) ;
layer . w3 = ml - > get_tensor ( layers_i + " .feed_forward.w3.weight " , { n_embd , n_ff } ) ;
2023-03-22 05:32:36 +00:00
}
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
ml - > done_getting_tensors ( ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// populate `tensors_by_name`
for ( llama_load_tensor & lt : ml - > tensors_map . tensors ) {
model . tensors_by_name . emplace_back ( lt . name , lt . ggml_tensor ) ;
2023-03-25 05:26:28 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
ml - > load_all_data ( progress_callback , progress_callback_user_data , use_mlock ? & lctx . model . mlock_mmap : NULL ) ;
2023-03-25 05:26:28 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
model . mapping = std : : move ( ml - > mapping ) ;
2023-03-22 05:32:36 +00:00
2023-03-29 20:51:37 +00:00
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
lctx . t_load_us = ggml_time_us ( ) - lctx . t_start_us ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
}
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
static bool llama_model_load (
const std : : string & fname ,
llama_context & lctx ,
int n_ctx ,
ggml_type memory_type ,
bool use_mmap ,
bool use_mlock ,
bool vocab_only ,
llama_progress_callback progress_callback ,
void * progress_callback_user_data ) {
try {
llama_model_load_internal ( fname , lctx , n_ctx , memory_type , use_mmap , use_mlock ,
vocab_only , progress_callback , progress_callback_user_data ) ;
return true ;
} catch ( const std : : string & err ) {
fprintf ( stderr , " error loading model: %s \n " , err . c_str ( ) ) ;
return false ;
2023-03-25 05:26:28 +00:00
}
2023-03-22 05:32:36 +00:00
}
// evaluate the transformer
//
//   - lctx:      llama context
//   - tokens:    new batch of tokens to process
//   - n_tokens:  number of entries in `tokens` (must be >= 1)
//   - n_past:    the context size so far
//   - n_threads: number of threads to use
//
// On success returns true after:
//   - writing the logits to lctx.logits (all N rows if lctx.logits_all, otherwise only the last row)
//   - writing the last row of the final-norm output to lctx.embedding, if that vector is non-empty
//   - initializing lctx.mem_per_token if it was still 0
//   - updating the timing counters in lctx
//
// Returns false without touching lctx when n_tokens < 1.
//
static bool llama_eval_internal(
        llama_context &  lctx,
    const llama_token *  tokens,
            const int    n_tokens,
            const int    n_past,
            const int    n_threads) {
    // an empty batch would divide by zero when computing mem_per_token and would
    // address row N-1 == -1 when copying the logits/embeddings out of the graph
    if (n_tokens < 1) {
        fprintf(stderr, "%s: n_tokens must be >= 1\n", __func__);
        return false;
    }

    const int64_t t_start_us = ggml_time_us();

    const int N = n_tokens;

    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;

    auto & kv_self = model.kv_self;

    LLAMA_ASSERT(!!kv_self.ctx);

    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx   = hparams.n_ctx;
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_embd/hparams.n_head;

    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute   = lctx.buf_compute;

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.addr,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

    // copy the token ids into an I32 tensor and look up their embedding rows
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));

    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        lctx.use_buf(ctx0, 0);

        // norm
        {
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));

                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                // important: storing RoPE-ed version of K in the KV cache!
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(kv_self.v),
                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

#if 1
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
#else
            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
            // is there a better way?
            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
        }

        lctx.use_buf(ctx0, 1);

        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
                        cur);
            }

            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);

            // SILU activation
            cur = ggml_silu(ctx0, cur);

            cur = ggml_mul(ctx0, cur, tmp);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
        }

        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        inpL = cur;
    }

    lctx.use_buf(ctx0, 0);

    // used at the end to optionally extract the embeddings
    struct ggml_tensor * embeddings = NULL;

    // norm
    {
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.norm, inpL),
                    inpL);

        embeddings = inpL;
    }

    // lm_head
    inpL = ggml_mul_mat(ctx0, model.output, inpL);

    lctx.use_buf(ctx0, -1);

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute       (ctx0, &gf);

#ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
    // requires GGML_PERF to be defined
    ggml_graph_print(&gf);
#endif

    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // extract logits
    {
        auto & logits_out = lctx.logits;

        if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
        } else {
            // return result for just the last token
            logits_out.resize(n_vocab);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
        }
    }

    // extract embeddings (only when the caller has pre-sized lctx.embedding)
    if (lctx.embedding.size()) {
        auto & embedding_out = lctx.embedding;

        embedding_out.resize(n_embd);
        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
    }

    // N >= 1 is guaranteed by the guard at the top of the function
    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }

#if 0
    printf ( " \n %s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB \n " , __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
            lctx.get_buf_max_mem(1)/1024.0/1024.0);
#endif

    ggml_free(ctx0);

    // measure the performance only for the single-token evals
    if (N == 1) {
        lctx.t_eval_us += ggml_time_us() - t_start_us;
        lctx.n_eval++;
    }
    else if (N > 1) {
        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
        lctx.n_p_eval += N;
    }

    return true;
}
//
// tokenizer
//
// Returns the number of bytes to consume for a UTF-8 sequence whose first byte
// is `src`. The answer is looked up by the top four bits of the byte:
// 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
static size_t utf8_len(char src) {
    // indexed by the high nibble; static so the table is not rebuilt on every call
    static const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    // go through uint8_t first so a negative `char` cannot produce a negative index
    const uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
// One node of the symbol chain built by the tokenizer: a (pointer, length)
// slice of the input text plus the indices of its neighbours in the chain.
struct llama_sp_symbol {
    typedef int index;   // position of a symbol inside the tokenizer's symbol vector

    index        prev;   // index of the previous symbol in the chain
    index        next;   // index of the next symbol in the chain
    const char * text;   // first byte of this symbol (not owned, not NUL-terminated)
    size_t       n;      // number of bytes covered by this symbol
};
struct llama_sp_bigram {
struct comparator {
bool operator ( ) ( llama_sp_bigram & l , llama_sp_bigram & r ) {
return ( l . score < r . score ) | | ( l . score = = r . score & & l . left > r . left ) ;
}
} ;
using queue_storage = std : : vector < llama_sp_bigram > ;
using queue = std : : priority_queue < llama_sp_bigram , queue_storage , comparator > ;
llama_sp_symbol : : index left ;
llama_sp_symbol : : index right ;
float score ;
size_t size ;
} ;
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer {
llama_tokenizer ( const llama_vocab & vocab ) : vocab_ ( vocab ) { }
void tokenize ( const std : : string & text , std : : vector < llama_vocab : : id > & output ) {
// split string into utf8 chars
int index = 0 ;
size_t offs = 0 ;
while ( offs < text . size ( ) ) {
llama_sp_symbol sym ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
size_t char_len = std : : min ( text . size ( ) - offs , utf8_len ( text [ offs ] ) ) ;
2023-03-22 05:32:36 +00:00
sym . text = text . c_str ( ) + offs ;
sym . n = char_len ;
offs + = char_len ;
sym . prev = index - 1 ;
sym . next = offs = = text . size ( ) ? - 1 : index + 1 ;
index + + ;
symbols_ . emplace_back ( std : : move ( sym ) ) ;
}
// seed the work queue with all possible 2-character tokens.
for ( size_t i = 1 ; i < symbols_ . size ( ) ; + + i ) {
try_add_bigram ( i - 1 , i ) ;
}
// keep substituting the highest frequency pairs for as long as we can.
while ( ! work_queue_ . empty ( ) ) {
auto bigram = work_queue_ . top ( ) ;
work_queue_ . pop ( ) ;
auto & left_sym = symbols_ [ bigram . left ] ;
auto & right_sym = symbols_ [ bigram . right ] ;
// if one of the symbols already got merged, skip it.
if ( left_sym . n = = 0 | | right_sym . n = = 0 | |
left_sym . n + right_sym . n ! = bigram . size ) {
continue ;
}
// merge the right sym into the left one
left_sym . n + = right_sym . n ;
right_sym . n = 0 ;
//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
// remove the right sym from the chain
left_sym . next = right_sym . next ;
if ( right_sym . next > = 0 ) {
symbols_ [ right_sym . next ] . prev = bigram . left ;
}
// find more substitutions
try_add_bigram ( left_sym . prev , bigram . left ) ;
try_add_bigram ( bigram . left , left_sym . next ) ;
}
for ( int i = 0 ; i ! = - 1 ; i = symbols_ [ i ] . next ) {
auto & symbol = symbols_ [ i ] ;
auto token = vocab_ . token_to_id . find ( std : : string ( symbol . text , symbol . n ) ) ;
if ( token = = vocab_ . token_to_id . end ( ) ) {
// output any symbols that did not form tokens as bytes.
for ( int j = 0 ; j < ( int ) symbol . n ; + + j ) {
llama_vocab : : id token_id = static_cast < uint8_t > ( symbol . text [ j ] ) + 3 ;
output . push_back ( token_id ) ;
}
} else {
output . push_back ( ( * token ) . second ) ;
}
}
}
private :
void try_add_bigram ( int left , int right ) {
if ( left = = - 1 | | right = = - 1 ) {
return ;
}
const std : : string text = std : : string ( symbols_ [ left ] . text , symbols_ [ left ] . n + symbols_ [ right ] . n ) ;
auto token = vocab_ . token_to_id . find ( text ) ;
if ( token = = vocab_ . token_to_id . end ( ) ) {
return ;
}
if ( static_cast < size_t > ( ( * token ) . second ) > = vocab_ . id_to_token . size ( ) ) {
return ;
}
const auto & tok_score = vocab_ . id_to_token [ ( * token ) . second ] ;
llama_sp_bigram bigram ;
bigram . left = left ;
bigram . right = right ;
bigram . score = tok_score . score ;
bigram . size = text . size ( ) ;
work_queue_ . push ( bigram ) ;
}
const llama_vocab & vocab_ ;
std : : vector < llama_sp_symbol > symbols_ ;
llama_sp_bigram : : queue work_queue_ ;
} ;
static std : : vector < llama_vocab : : id > llama_tokenize ( const llama_vocab & vocab , const std : : string & text , bool bos ) {
llama_tokenizer tokenizer ( vocab ) ;
std : : vector < llama_vocab : : id > output ;
if ( text . size ( ) = = 0 ) {
return output ;
}
if ( bos ) {
output . push_back ( 1 ) ;
}
tokenizer . tokenize ( text , output ) ;
return output ;
}
//
// sampling
//
2023-03-28 16:48:20 +00:00
static void sample_top_k ( std : : vector < std : : pair < float , llama_vocab : : id > > & logits_id , int top_k ) {
2023-03-22 05:32:36 +00:00
// find the top k tokens
std : : partial_sort (
logits_id . begin ( ) ,
logits_id . begin ( ) + top_k , logits_id . end ( ) ,
2023-03-28 16:48:20 +00:00
[ ] ( const std : : pair < float , llama_vocab : : id > & a , const std : : pair < float , llama_vocab : : id > & b ) {
2023-03-22 05:32:36 +00:00
return a . first > b . first ;
} ) ;
logits_id . resize ( top_k ) ;
}
// Samples one token id from the last n_vocab logits stored in lctx.logits.
//
//   - last_n_tokens:  ids whose logits are penalized by `repeat_penalty`
//   - top_k:          number of best candidates to keep (<= 0 keeps all of them)
//   - top_p:          cumulative-probability cutoff, applied only when < 1.0
//   - temp:           temperature; <= 0 returns the id with the highest logit directly
//   - repeat_penalty: factor applied to the scaled logits of ids in `last_n_tokens`
//
// Draws from lctx.rng unless temp <= 0.
static llama_vocab::id llama_sample_top_p_top_k(
        llama_context & lctx,
        const std::vector<llama_vocab::id> & last_n_tokens,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty) {
    auto & rng = lctx.rng;

    const int n_logits = lctx.model.hparams.n_vocab;

    // the logits of interest are the trailing n_logits entries of lctx.logits
    const auto & logits  = lctx.logits;
    const auto * plogits = logits.data() + logits.size() - n_logits;

    if (temp <= 0) {
        // select the token with the highest logit directly (first one wins on ties)
        const auto * best = std::max_element(plogits, plogits + n_logits);
        return static_cast<llama_vocab::id>(best - plogits);
    }

    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float inv_temp = 1.0f/temp;

        for (int id = 0; id < n_logits; ++id) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            const bool repeated =
                std::find(last_n_tokens.begin(), last_n_tokens.end(), id) != last_n_tokens.end();

            float scaled;
            if (!repeated) {
                scaled = plogits[id]*inv_temp;
            } else if (plogits[id] < 0.0f) {
                // a negative score has to be multiplied by the penalty to reduce the token's probability
                scaled = plogits[id]*inv_temp*repeat_penalty;
            } else {
                scaled = plogits[id]*inv_temp/repeat_penalty;
            }

            logits_id.emplace_back(scaled, id);
        }
    }

    const int k = top_k > 0 ? std::min(top_k, n_logits) : n_logits;
    sample_top_k(logits_id, k);

    // compute probs for the top k tokens
    std::vector<float> probs;
    probs.reserve(logits_id.size());

    // after sample_top_k the first entry holds the largest scaled logit
    float maxl = logits_id[0].first;
    double sum = 0.0;
    for (size_t i = 0; i < logits_id.size(); ++i) {
        const float p = expf(logits_id[i].first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (size_t i = 0; i < probs.size(); ++i) {
        probs[i] /= sum;
    }

    if (top_p < 1.0) {
        // keep the shortest prefix whose cumulative probability reaches top_p
        double cumsum = 0.0;
        size_t n_keep = probs.size();
        for (size_t i = 0; i < probs.size(); ++i) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                n_keep = i + 1;
                break;
            }
        }
        probs.resize(n_keep);
        logits_id.resize(n_keep);
    }

    //printf("\n");
    //for (int i = 0; i < (int) 10; i++) {
    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
    //}
    //printf("\n\n");
    //exit(0);

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    const int idx = dist(rng);

    return logits_id[idx].second;
}
//
// quantization
//
2023-04-20 17:42:27 +00:00
static void llama_model_quantize_internal ( const std : : string & fname_inp , const std : : string & fname_out , enum llama_ftype ftype , int nthread ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
ggml_type quantized_type ;
2023-04-11 15:03:51 +00:00
switch ( ftype ) {
case LLAMA_FTYPE_MOSTLY_Q4_0 : quantized_type = GGML_TYPE_Q4_0 ; break ;
case LLAMA_FTYPE_MOSTLY_Q4_1 : quantized_type = GGML_TYPE_Q4_1 ; break ;
2023-04-18 20:54:57 +00:00
case LLAMA_FTYPE_MOSTLY_Q4_2 : quantized_type = GGML_TYPE_Q4_2 ; break ;
2023-04-26 20:14:13 +00:00
case LLAMA_FTYPE_MOSTLY_Q5_0 : quantized_type = GGML_TYPE_Q5_0 ; break ;
case LLAMA_FTYPE_MOSTLY_Q5_1 : quantized_type = GGML_TYPE_Q5_1 ; break ;
2023-04-25 20:40:51 +00:00
case LLAMA_FTYPE_MOSTLY_Q8_0 : quantized_type = GGML_TYPE_Q8_0 ; break ;
2023-04-11 15:03:51 +00:00
default : throw format ( " invalid output file type %d \n " , ftype ) ;
2023-03-22 05:32:36 +00:00
} ;
2023-04-20 17:42:27 +00:00
if ( nthread < = 0 ) {
nthread = std : : thread : : hardware_concurrency ( ) ;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
std : : unique_ptr < llama_model_loader > model_loader ( new llama_model_loader ( fname_inp . c_str ( ) , /*use_mmap*/ false ,
/*vocab_only*/ false ) ) ;
2023-04-11 15:03:51 +00:00
llama_file_saver file_saver ( fname_out . c_str ( ) , model_loader - > file_loaders . at ( 0 ) . get ( ) , ftype ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
size_t total_size_org = 0 ;
size_t total_size_new = 0 ;
std : : vector < int64_t > hist_all ( 1 < < 4 , 0 ) ;
2023-04-20 17:42:27 +00:00
std : : vector < std : : thread > workers ;
std : : mutex mutex ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
size_t idx = 0 ;
for ( llama_load_tensor & tensor : model_loader - > tensors_map . tensors ) {
llama_buffer read_data ;
read_data . resize ( tensor . size ) ;
tensor . data = read_data . addr ;
model_loader - > load_data_for ( tensor ) ;
2023-04-16 10:58:48 +00:00
printf ( " [%4zu/%4zu] %36s - %16s, type = %6s, " ,
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
+ + idx , model_loader - > tensors_map . tensors . size ( ) ,
tensor . name . c_str ( ) , llama_format_tensor_shape ( tensor . ne ) . c_str ( ) ,
2023-04-14 18:05:37 +00:00
ggml_type_name ( tensor . type ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = tensor . name . rfind ( " weight " ) = = tensor . name . size ( ) - 6 ; // ends with 'weight'?
// quantize only 2D tensors
quantize & = ( tensor . ne . size ( ) = = 2 ) ;
2023-04-21 07:23:36 +00:00
// uncomment this to keep the output layer in FP16
//if (tensor.name == "output.weight") {
2023-04-20 20:32:59 +00:00
// quantize = false;
//}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
enum ggml_type new_type ;
void * new_data ;
size_t new_size ;
llama_buffer work ;
if ( ! quantize ) {
new_type = tensor . type ;
new_data = tensor . data ;
new_size = tensor . size ;
printf ( " size = %8.3f MB \n " , tensor . size / 1024.0 / 1024.0 ) ;
} else {
new_type = quantized_type ;
float * f32_data ;
size_t nelements = tensor . ne . at ( 0 ) * tensor . ne . at ( 1 ) ;
llama_buffer f32_conv_buf ;
if ( tensor . type = = GGML_TYPE_F32 ) {
f32_data = ( float * ) tensor . data ;
} else if ( tensor . type = = GGML_TYPE_F16 ) {
f32_conv_buf . resize ( nelements * sizeof ( float ) ) ;
f32_data = ( float * ) f32_conv_buf . addr ;
auto f16_data = ( const ggml_fp16_t * ) tensor . data ;
for ( size_t i = 0 ; i < nelements ; i + + ) {
f32_data [ i ] = ggml_fp16_to_fp32 ( f16_data [ i ] ) ;
2023-03-22 05:32:36 +00:00
}
} else {
2023-04-14 18:05:37 +00:00
throw format ( " type %s unsupported for integer quantization " , ggml_type_name ( tensor . type ) ) ;
2023-03-22 05:32:36 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " quantizing .. " ) ;
fflush ( stdout ) ;
work . resize ( nelements * 4 ) ; // upper bound on size
new_data = work . addr ;
std : : vector < int64_t > hist_cur ( 1 < < 4 , 0 ) ;
2023-04-20 17:42:27 +00:00
int chunk_size = 32 * 512 ;
const int nchunk = ( nelements + chunk_size - 1 ) / chunk_size ;
const int nthread_use = nthread > 1 ? std : : max ( 1 , std : : min ( nthread , nchunk ) ) : 1 ;
if ( nthread_use < 2 ) {
new_size = ggml_quantize_chunk ( new_type , f32_data , new_data , 0 , nelements , hist_cur . data ( ) ) ;
} else {
size_t counter = 0 ;
new_size = 0 ;
auto compute = [ & mutex , & counter , & hist_cur , & new_size , new_type , f32_data , new_data , nelements , chunk_size ] ( ) {
std : : vector < int64_t > local_hist ;
size_t local_size = 0 ;
while ( true ) {
std : : unique_lock < std : : mutex > lock ( mutex ) ;
size_t first = counter ; counter + = chunk_size ;
if ( first > = nelements ) {
if ( ! local_hist . empty ( ) ) {
for ( int j = 0 ; j < int ( local_hist . size ( ) ) ; + + j ) hist_cur [ j ] + = local_hist [ j ] ;
new_size + = local_size ;
}
break ;
}
lock . unlock ( ) ;
size_t last = std : : min ( nelements , first + chunk_size ) ;
if ( local_hist . empty ( ) ) local_hist . resize ( hist_cur . size ( ) , 0 ) ;
local_size + = ggml_quantize_chunk ( new_type , f32_data , new_data , first , last - first , local_hist . data ( ) ) ;
}
} ;
if ( int ( workers . size ( ) ) < nthread_use - 1 ) workers . resize ( nthread_use - 1 ) ;
for ( int it = 0 ; it < nthread_use - 1 ; + + it ) workers [ it ] = std : : thread ( compute ) ;
compute ( ) ;
for ( int it = 0 ; it < nthread_use - 1 ; + + it ) workers [ it ] . join ( ) ;
2023-03-22 05:32:36 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " size = %8.2f MB -> %8.2f MB | hist: " , tensor . size / 1024.0 / 1024.0 , new_size / 1024.0 / 1024.0 ) ;
for ( size_t i = 0 ; i < hist_cur . size ( ) ; i + + ) {
hist_all [ i ] + = hist_cur [ i ] ;
2023-03-29 20:51:37 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
for ( size_t i = 0 ; i < hist_cur . size ( ) ; i + + ) {
printf ( " %5.3f " , hist_cur [ i ] / float ( nelements ) ) ;
2023-03-22 05:32:36 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " \n " ) ;
2023-03-22 05:32:36 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
total_size_org + = tensor . size ;
total_size_new + = new_size ;
file_saver . write_tensor ( tensor , new_type , new_data , new_size ) ;
}
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " %s: model size = %8.2f MB \n " , __func__ , total_size_org / 1024.0 / 1024.0 ) ;
printf ( " %s: quant size = %8.2f MB \n " , __func__ , total_size_new / 1024.0 / 1024.0 ) ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
{
int64_t sum_all = 0 ;
for ( size_t i = 0 ; i < hist_all . size ( ) ; i + + ) {
sum_all + = hist_all [ i ] ;
}
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " %s: hist: " , __func__ ) ;
for ( size_t i = 0 ; i < hist_all . size ( ) ; i + + ) {
printf ( " %5.3f " , hist_all [ i ] / float ( sum_all ) ) ;
2023-03-22 05:32:36 +00:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
printf ( " \n " ) ;
2023-03-22 05:32:36 +00:00
}
}
//
// interface implementation
//
struct llama_context * llama_init_from_file (
const char * path_model ,
struct llama_context_params params ) {
ggml_time_init ( ) ;
llama_context * ctx = new llama_context ;
2023-03-22 05:45:00 +00:00
if ( params . seed < = 0 ) {
params . seed = time ( NULL ) ;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
unsigned cur_percentage = 0 ;
if ( params . progress_callback = = NULL ) {
params . progress_callback_user_data = & cur_percentage ;
params . progress_callback = [ ] ( float progress , void * ctx ) {
unsigned * cur_percentage_p = ( unsigned * ) ctx ;
unsigned percentage = ( unsigned ) ( 100 * progress ) ;
while ( percentage > * cur_percentage_p ) {
+ + * cur_percentage_p ;
fprintf ( stderr , " . " ) ;
fflush ( stderr ) ;
if ( percentage > = 100 ) {
fprintf ( stderr , " \n " ) ;
}
}
} ;
}
2023-03-22 05:32:36 +00:00
ctx - > rng = std : : mt19937 ( params . seed ) ;
ctx - > logits_all = params . logits_all ;
2023-03-24 21:17:37 +00:00
ggml_type memory_type = params . f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32 ;
2023-03-22 05:32:36 +00:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
if ( ! llama_model_load ( path_model , * ctx , params . n_ctx , memory_type ,
params . use_mmap , params . use_mlock , params . vocab_only ,
params . progress_callback , params . progress_callback_user_data ) ) {
2023-03-22 05:32:36 +00:00
fprintf ( stderr , " %s: failed to load model \n " , __func__ ) ;
2023-03-24 15:21:01 +00:00
llama_free ( ctx ) ;
2023-03-22 05:32:36 +00:00
return nullptr ;
}
2023-03-24 15:21:01 +00:00
2023-03-24 15:05:13 +00:00
// reserve memory for context buffers
2023-04-02 07:18:53 +00:00
if ( ! params . vocab_only ) {
2023-03-24 21:17:37 +00:00
if ( ! kv_cache_init ( ctx - > model . hparams , ctx - > model . kv_self , memory_type , ctx - > model . hparams . n_ctx ) ) {
fprintf ( stderr , " %s: kv_cache_init() failed for self-attention cache \n " , __func__ ) ;
llama_free ( ctx ) ;
return nullptr ;
}
{
const size_t memory_size = ggml_nbytes ( ctx - > model . kv_self . k ) + ggml_nbytes ( ctx - > model . kv_self . v ) ;
fprintf ( stderr , " %s: kv self size = %7.2f MB \n " , __func__ , memory_size / 1024.0 / 1024.0 ) ;
}
2023-03-24 15:05:13 +00:00
const auto & hparams = ctx - > model . hparams ;
2023-03-25 18:51:14 +00:00
// resized during inference
2023-03-24 15:05:13 +00:00
if ( params . logits_all ) {
ctx - > logits . reserve ( hparams . n_ctx * hparams . n_vocab ) ;
} else {
2023-04-22 06:21:32 +00:00
ctx - > logits . reserve ( hparams . n_vocab ) ;
2023-03-24 15:05:13 +00:00
}
if ( params . embedding ) {
2023-03-25 18:51:14 +00:00
ctx - > embedding . resize ( hparams . n_embd ) ;
2023-03-24 15:05:13 +00:00
}
2023-03-24 21:17:37 +00:00
2023-04-17 14:41:53 +00:00
ctx - > buf_compute . resize ( MEM_REQ_EVAL ( ) . at ( ctx - > model . type ) ) ;
2023-03-24 21:17:37 +00:00
2023-04-17 14:41:53 +00:00
ctx - > buf_scratch [ 0 ] . resize ( MEM_REQ_SCRATCH0 ( ) . at ( ctx - > model . type ) ) ;
ctx - > buf_scratch [ 1 ] . resize ( MEM_REQ_SCRATCH1 ( ) . at ( ctx - > model . type ) ) ;
2023-03-24 15:05:13 +00:00
}
2023-03-22 05:32:36 +00:00
return ctx ;
}
// Destroys a context created by llama_init_from_file().
// A null `ctx` is accepted: `delete` on a null pointer does nothing.
void llama_free(struct llama_context * ctx) {
    delete ctx;
}
int llama_model_quantize (
const char * fname_inp ,
const char * fname_out ,
2023-04-20 17:42:27 +00:00
enum llama_ftype ftype ,
int nthread ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
try {
2023-04-20 17:42:27 +00:00
llama_model_quantize_internal ( fname_inp , fname_out , ftype , nthread ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
return 0 ;
} catch ( const std : : string & err ) {
fprintf ( stderr , " %s: failed to quantize: %s \n " , __func__ , err . c_str ( ) ) ;
2023-03-22 05:32:36 +00:00
return 1 ;
}
}
2023-04-17 15:28:55 +00:00
int llama_apply_lora_from_file_internal ( struct llama_context * ctx , const char * path_lora , const char * path_base_model , int n_threads ) {
fprintf ( stderr , " %s: applying lora adapter from '%s' - please wait ... \n " , __func__ , path_lora ) ;
auto & model = ctx - > model ;
const int64_t t_start_lora_us = ggml_time_us ( ) ;
auto fin = std : : ifstream ( path_lora , std : : ios : : binary ) ;
if ( ! fin ) {
fprintf ( stderr , " %s: failed to open '%s' \n " , __func__ , path_lora ) ;
return 1 ;
}
// verify magic and version
{
uint32_t magic ;
fin . read ( ( char * ) & magic , sizeof ( magic ) ) ;
if ( magic ! = ' ggla ' ) {
fprintf ( stderr , " %s: bad file magic \n " , __func__ ) ;
return 1 ;
}
uint32_t format_version ;
fin . read ( ( char * ) & format_version , sizeof ( format_version ) ) ;
if ( format_version ! = 1 ) {
fprintf ( stderr , " %s: unsupported file version \n " , __func__ ) ;
return 1 ;
}
}
int32_t lora_r ;
int32_t lora_alpha ;
fin . read ( ( char * ) & lora_r , sizeof ( lora_r ) ) ;
fin . read ( ( char * ) & lora_alpha , sizeof ( lora_alpha ) ) ;
float scaling = ( float ) lora_alpha / ( float ) lora_r ;
fprintf ( stderr , " %s: r = %d, alpha = %d, scaling = %.2f \n " , __func__ , lora_r , lora_alpha , scaling ) ;
// create a temporary ggml context to store the lora tensors
// todo: calculate size from biggest possible tensor
std : : vector < uint8_t > lora_buf ( 1024ull * 1024ull * 1024ull ) ;
struct ggml_init_params params ;
params . mem_size = lora_buf . size ( ) ;
params . mem_buffer = lora_buf . data ( ) ;
params . no_alloc = false ;
ggml_context * lora_ctx = ggml_init ( params ) ;
std : : unordered_map < std : : string , struct ggml_tensor * > lora_tensors ;
// create a name -> tensor map of the model to accelerate lookups
std : : unordered_map < std : : string , struct ggml_tensor * > model_tensors ;
for ( auto & kv : model . tensors_by_name ) {
model_tensors . insert ( kv ) ;
}
// load base model
std : : unique_ptr < llama_model_loader > model_loader ;
ggml_context * base_ctx = NULL ;
llama_buffer base_buf ;
if ( path_base_model ) {
fprintf ( stderr , " %s: loading base model from '%s' \n " , __func__ , path_base_model ) ;
model_loader . reset ( new llama_model_loader ( path_base_model , /*use_mmap*/ true , /*vocab_only*/ false ) ) ;
size_t ctx_size , mmapped_size ;
model_loader - > calc_sizes ( & ctx_size , & mmapped_size ) ;
base_buf . resize ( ctx_size ) ;
ggml_init_params base_params ;
base_params . mem_size = base_buf . size ;
base_params . mem_buffer = base_buf . addr ;
base_params . no_alloc = model_loader - > use_mmap ;
base_ctx = ggml_init ( base_params ) ;
model_loader - > ggml_ctx = base_ctx ;
// maybe this should in llama_model_loader
if ( model_loader - > use_mmap ) {
model_loader - > mapping . reset ( new llama_mmap ( & model_loader - > file_loaders . at ( 0 ) - > file , /* prefetch */ false ) ) ;
}
}
// read tensors and apply
bool warned = false ;
int n_tensors = 0 ;
while ( true ) {
int32_t n_dims ;
int32_t length ;
int32_t ftype ;
fin . read ( reinterpret_cast < char * > ( & n_dims ) , sizeof ( n_dims ) ) ;
fin . read ( reinterpret_cast < char * > ( & length ) , sizeof ( length ) ) ;
fin . read ( reinterpret_cast < char * > ( & ftype ) , sizeof ( ftype ) ) ;
if ( fin . eof ( ) ) {
break ;
}
int32_t ne [ 2 ] = { 1 , 1 } ;
for ( int i = 0 ; i < n_dims ; + + i ) {
fin . read ( reinterpret_cast < char * > ( & ne [ i ] ) , sizeof ( ne [ i ] ) ) ;
}
std : : string name ( length , 0 ) ;
fin . read ( & name [ 0 ] , length ) ;
// check for lora suffix and get the type of tensor
const std : : string lora_suffix = " .lora " ;
size_t pos = name . rfind ( lora_suffix ) ;
if ( pos = = std : : string : : npos ) {
fprintf ( stderr , " %s: error: '%s' is not a lora tensor \n " , __func__ , name . c_str ( ) ) ;
return 1 ;
}
std : : string lora_type = name . substr ( pos + lora_suffix . length ( ) ) ;
std : : string base_name = name ;
base_name . erase ( pos ) ;
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
if ( model_tensors . find ( base_name . data ( ) ) = = model_tensors . end ( ) ) {
fprintf ( stderr , " %s: unknown tensor '%s' in lora adapter \n " , __func__ , name . data ( ) ) ;
return 1 ;
}
// create ggml tensor
ggml_type wtype ;
switch ( ftype ) {
case 0 : wtype = GGML_TYPE_F32 ; break ;
case 1 : wtype = GGML_TYPE_F16 ; break ;
default :
{
fprintf ( stderr , " %s: invalid tensor data type '%d' \n " ,
__func__ , ftype ) ;
return false ;
}
}
ggml_tensor * lora_tensor ;
if ( n_dims = = 2 ) {
lora_tensor = ggml_new_tensor_2d ( lora_ctx , wtype , ne [ 0 ] , ne [ 1 ] ) ;
}
else {
fprintf ( stderr , " %s: unsupported tensor dimension %d \n " , __func__ , n_dims ) ;
return 1 ;
}
// load tensor data
size_t offset = fin . tellg ( ) ;
size_t tensor_data_size = ggml_nbytes ( lora_tensor ) ;
offset = ( offset + 31 ) & - 32 ;
fin . seekg ( offset ) ;
fin . read ( ( char * ) lora_tensor - > data , tensor_data_size ) ;
lora_tensors [ name ] = lora_tensor ;
// check if we have both A and B tensors and apply
if ( lora_tensors . find ( base_name + " .loraA " ) ! = lora_tensors . end ( ) & &
lora_tensors . find ( base_name + " .loraB " ) ! = lora_tensors . end ( ) ) {
ggml_tensor * dest_t = model_tensors [ base_name ] ;
ggml_tensor * base_t ;
if ( model_loader ) {
// load from base model
if ( model_loader - > tensors_map . name_to_idx . find ( base_name ) = = model_loader - > tensors_map . name_to_idx . end ( ) ) {
fprintf ( stderr , " %s: error: tensor '%s' not found in base model \n " , __func__ , base_name . c_str ( ) ) ;
return 1 ;
}
size_t idx = model_loader - > tensors_map . name_to_idx [ base_name ] ;
llama_load_tensor & lt = model_loader - > tensors_map . tensors [ idx ] ;
base_t = model_loader - > get_tensor ( base_name , { ( uint32_t ) dest_t - > ne [ 0 ] , ( uint32_t ) dest_t - > ne [ 1 ] } ) ;
lt . data = ( uint8_t * ) lt . ggml_tensor - > data ;
model_loader - > load_data_for ( lt ) ;
lt . ggml_tensor - > data = lt . data ;
}
else {
base_t = dest_t ;
}
2023-04-20 17:35:53 +00:00
if ( ggml_is_quantized ( base_t - > type ) ) {
2023-04-17 15:28:55 +00:00
if ( ! warned ) {
fprintf ( stderr , " %s: warning: using a lora adapter with a quantized model may result in poor quality, "
" use a f16 or f32 base model with --lora-base \n " , __func__ ) ;
warned = true ;
}
}
ggml_tensor * loraA = lora_tensors [ base_name + " .loraA " ] ;
ggml_tensor * loraB = lora_tensors [ base_name + " .loraB " ] ;
if ( base_t - > ne [ 0 ] ! = loraA - > ne [ 1 ] | | base_t - > ne [ 1 ] ! = loraB - > ne [ 1 ] ) {
fprintf ( stderr , " %s: incompatible tensor dimensions (% " PRId64 " and % " PRId64 " ); "
" are you sure that this adapter is for this model? \n " , __func__ , base_t - > ne [ 0 ] , loraA - > ne [ 1 ] ) ;
return 1 ;
}
// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat ( lora_ctx , loraA , loraB ) ;
if ( scaling ! = 1.0f ) {
ggml_tensor * scale_tensor = ggml_new_f32 ( lora_ctx , scaling ) ;
BA = ggml_scale ( lora_ctx , BA , scale_tensor ) ;
}
ggml_tensor * r ;
if ( base_t = = dest_t ) {
r = ggml_add_inplace ( lora_ctx , dest_t , BA ) ;
}
else {
r = ggml_add ( lora_ctx , base_t , BA ) ;
r = ggml_cpy ( lora_ctx , r , dest_t ) ;
}
struct ggml_cgraph gf = ggml_build_forward ( r ) ;
gf . n_threads = n_threads ;
ggml_graph_compute ( lora_ctx , & gf ) ;
// we won't need these tensors again, reset the context to save memory
ggml_free ( lora_ctx ) ;
lora_ctx = ggml_init ( params ) ;
lora_tensors . clear ( ) ;
n_tensors + + ;
if ( n_tensors % 4 = = 0 )
fprintf ( stderr , " . " ) ;
}
}
// TODO: this should be in a destructor, it will leak on failure
ggml_free ( lora_ctx ) ;
if ( base_ctx ) {
ggml_free ( base_ctx ) ;
}
const int64_t t_lora_us = ggml_time_us ( ) - t_start_lora_us ;
fprintf ( stderr , " done (%.2f ms) \n " , t_lora_us / 1000.0 ) ;
return 0 ;
}
// Public entry point for applying a LoRA adapter to the model owned by `ctx`.
// Forwards all arguments to llama_apply_lora_from_file_internal and converts a
// thrown std::string into an error code so it does not escape this function.
// Returns the internal function's result, or 1 when a std::string was thrown.
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    int status = 1;
    try {
        status = llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
    } catch (const std::string & err) {
        fprintf(stderr, " %s: failed to apply lora adapter: %s \n ", __func__, err.c_str());
        status = 1;
    }
    return status;
}
2023-04-24 04:40:02 +00:00
int llama_get_kv_cache_token_count ( struct llama_context * ctx ) {
return ctx - > model . kv_self . n ;
2023-04-02 10:23:04 +00:00
}
2023-04-24 04:40:02 +00:00
// Size in bytes of the fixed-width region reserved for the serialized rng state.
// The replacement list is parenthesized so the macro expands correctly inside
// larger expressions (e.g. `x / LLAMA_MAX_RNG_STATE`).
#define LLAMA_MAX_RNG_STATE (64*1024)
2023-04-26 20:08:43 +00:00
void llama_set_rng_seed ( struct llama_context * ctx , int seed ) {
if ( seed < = 0 ) {
seed = time ( NULL ) ;
}
ctx - > rng . seed ( seed ) ;
}
2023-04-24 04:40:02 +00:00
// Returns the size of the state
size_t llama_get_state_size ( struct llama_context * ctx ) {
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
// for reference, std::mt19937(1337) serializes to 6701 bytes.
const size_t s_rng_size = sizeof ( size_t ) ;
const size_t s_rng = LLAMA_MAX_RNG_STATE ;
const size_t s_logits_capacity = sizeof ( size_t ) ;
const size_t s_logits_size = sizeof ( size_t ) ;
const size_t s_logits = ctx - > logits . capacity ( ) * sizeof ( float ) ;
const size_t s_embedding_size = sizeof ( size_t ) ;
const size_t s_embedding = ctx - > embedding . size ( ) * sizeof ( float ) ;
const size_t s_kv_size = sizeof ( size_t ) ;
const size_t s_kv_ntok = sizeof ( int ) ;
const size_t s_kv = ctx - > model . kv_self . buf . size ;
const size_t s_total = (
+ s_rng_size
+ s_rng
+ s_logits_capacity
+ s_logits_size
+ s_logits
+ s_embedding_size
+ s_embedding
+ s_kv_size
+ s_kv_ntok
+ s_kv
) ;
return s_total ;
2023-04-02 10:23:04 +00:00
}
2023-04-24 04:40:02 +00:00
// Copies the state to the specified destination address
size_t llama_copy_state_data ( struct llama_context * ctx , uint8_t * dest ) {
uint8_t * out = dest ;
// copy rng
{
std : : stringstream rng_ss ;
rng_ss < < ctx - > rng ;
const size_t rng_size = rng_ss . str ( ) . size ( ) ;
char rng_buf [ LLAMA_MAX_RNG_STATE ] ;
memset ( & rng_buf [ 0 ] , 0 , LLAMA_MAX_RNG_STATE ) ;
memcpy ( & rng_buf [ 0 ] , rng_ss . str ( ) . data ( ) , rng_ss . str ( ) . size ( ) ) ;
memcpy ( out , & rng_size , sizeof ( rng_size ) ) ; out + = sizeof ( rng_size ) ;
memcpy ( out , & rng_buf [ 0 ] , LLAMA_MAX_RNG_STATE ) ; out + = LLAMA_MAX_RNG_STATE ;
}
// copy logits
{
const size_t logits_cap = ctx - > logits . capacity ( ) ;
const size_t logits_size = ctx - > logits . size ( ) ;
memcpy ( out , & logits_cap , sizeof ( logits_cap ) ) ; out + = sizeof ( logits_cap ) ;
memcpy ( out , & logits_size , sizeof ( logits_size ) ) ; out + = sizeof ( logits_size ) ;
if ( logits_size ) {
memcpy ( out , ctx - > logits . data ( ) , logits_size * sizeof ( float ) ) ;
}
out + = logits_cap * sizeof ( float ) ;
}
// copy embeddings
{
const size_t embedding_size = ctx - > embedding . size ( ) ;
memcpy ( out , & embedding_size , sizeof ( embedding_size ) ) ; out + = sizeof ( embedding_size ) ;
if ( embedding_size ) {
memcpy ( out , ctx - > embedding . data ( ) , embedding_size * sizeof ( float ) ) ;
out + = embedding_size * sizeof ( float ) ;
}
}
// copy kv cache
{
const size_t kv_size = ctx - > model . kv_self . buf . size ;
const int kv_ntok = llama_get_kv_cache_token_count ( ctx ) ;
memcpy ( out , & kv_size , sizeof ( kv_size ) ) ; out + = sizeof ( kv_size ) ;
memcpy ( out , & kv_ntok , sizeof ( kv_ntok ) ) ; out + = sizeof ( kv_ntok ) ;
if ( kv_size ) {
memcpy ( out , ctx - > model . kv_self . buf . addr , kv_size ) ; out + = kv_size ;
}
}
const size_t written = out - dest ;
const size_t expected = llama_get_state_size ( ctx ) ;
LLAMA_ASSERT ( written = = expected ) ;
return written ;
2023-04-02 10:23:04 +00:00
}
2023-04-24 04:40:02 +00:00
// Sets the state reading from the specified source address
size_t llama_set_state_data ( struct llama_context * ctx , const uint8_t * src ) {
const uint8_t * in = src ;
// set rng
{
size_t rng_size ;
char rng_buf [ LLAMA_MAX_RNG_STATE ] ;
memcpy ( & rng_size , in , sizeof ( rng_size ) ) ; in + = sizeof ( rng_size ) ;
memcpy ( & rng_buf [ 0 ] , in , LLAMA_MAX_RNG_STATE ) ; in + = LLAMA_MAX_RNG_STATE ;
std : : stringstream rng_ss ;
rng_ss . str ( std : : string ( & rng_buf [ 0 ] , rng_size ) ) ;
rng_ss > > ctx - > rng ;
LLAMA_ASSERT ( rng_ss . fail ( ) = = false ) ;
}
// set logits
{
size_t logits_cap ;
size_t logits_size ;
memcpy ( & logits_cap , in , sizeof ( logits_cap ) ) ; in + = sizeof ( logits_cap ) ;
memcpy ( & logits_size , in , sizeof ( logits_size ) ) ; in + = sizeof ( logits_size ) ;
LLAMA_ASSERT ( ctx - > logits . capacity ( ) = = logits_cap ) ;
if ( logits_size ) {
ctx - > logits . resize ( logits_size ) ;
memcpy ( ctx - > logits . data ( ) , in , logits_size * sizeof ( float ) ) ;
}
in + = logits_cap * sizeof ( float ) ;
}
// set embeddings
{
size_t embedding_size ;
memcpy ( & embedding_size , in , sizeof ( embedding_size ) ) ; in + = sizeof ( embedding_size ) ;
LLAMA_ASSERT ( ctx - > embedding . capacity ( ) = = embedding_size ) ;
if ( embedding_size ) {
memcpy ( ctx - > embedding . data ( ) , in , embedding_size * sizeof ( float ) ) ;
in + = embedding_size * sizeof ( float ) ;
}
}
// set kv cache
{
size_t kv_size ;
int kv_ntok ;
memcpy ( & kv_size , in , sizeof ( kv_size ) ) ; in + = sizeof ( kv_size ) ;
memcpy ( & kv_ntok , in , sizeof ( kv_ntok ) ) ; in + = sizeof ( kv_ntok ) ;
if ( kv_size ) {
LLAMA_ASSERT ( ctx - > model . kv_self . buf . size = = kv_size ) ;
void * k_data = ctx - > model . kv_self . k - > data ; // remember data pointers
void * v_data = ctx - > model . kv_self . v - > data ; // because their value is stored in buf and overwritten by memcpy
memcpy ( ctx - > model . kv_self . buf . addr , in , kv_size ) ; in + = kv_size ;
ctx - > model . kv_self . k - > data = k_data ; // restore correct data pointers
ctx - > model . kv_self . v - > data = v_data ;
}
ctx - > model . kv_self . n = kv_ntok ;
}
const size_t nread = in - src ;
const size_t expected = llama_get_state_size ( ctx ) ;
LLAMA_ASSERT ( nread = = expected ) ;
return nread ;
2023-04-02 10:23:04 +00:00
}
2023-03-22 05:32:36 +00:00
int llama_eval (
struct llama_context * ctx ,
const llama_token * tokens ,
int n_tokens ,
int n_past ,
int n_threads ) {
if ( ! llama_eval_internal ( * ctx , tokens , n_tokens , n_past , n_threads ) ) {
fprintf ( stderr , " %s: failed to eval \n " , __func__ ) ;
return 1 ;
}
2023-03-29 20:51:37 +00:00
// get a more accurate load time, upon first eval
if ( ! ctx - > has_evaluated_once ) {
ctx - > t_load_us = ggml_time_us ( ) - ctx - > t_start_us ;
ctx - > has_evaluated_once = true ;
}
2023-03-22 05:32:36 +00:00
return 0 ;
}
int llama_tokenize (
struct llama_context * ctx ,
const char * text ,
llama_token * tokens ,
int n_max_tokens ,
bool add_bos ) {
auto res = llama_tokenize ( ctx - > vocab , text , add_bos ) ;
if ( n_max_tokens < ( int ) res . size ( ) ) {
fprintf ( stderr , " %s: too many tokens \n " , __func__ ) ;
return - ( ( int ) res . size ( ) ) ;
}
for ( size_t i = 0 ; i < res . size ( ) ; i + + ) {
tokens [ i ] = res [ i ] ;
}
return res . size ( ) ;
}
int llama_n_vocab ( struct llama_context * ctx ) {
return ctx - > vocab . id_to_token . size ( ) ;
}
int llama_n_ctx ( struct llama_context * ctx ) {
return ctx - > model . hparams . n_ctx ;
}
2023-03-25 18:51:14 +00:00
int llama_n_embd ( struct llama_context * ctx ) {
return ctx - > model . hparams . n_embd ;
}
2023-03-22 05:32:36 +00:00
float * llama_get_logits ( struct llama_context * ctx ) {
return ctx - > logits . data ( ) ;
}
2023-03-24 15:05:13 +00:00
float * llama_get_embeddings ( struct llama_context * ctx ) {
return ctx - > embedding . data ( ) ;
}
2023-03-22 05:32:36 +00:00
const char * llama_token_to_str ( struct llama_context * ctx , llama_token token ) {
if ( token > = llama_n_vocab ( ctx ) ) {
return nullptr ;
}
return ctx - > vocab . id_to_token [ token ] . tok . c_str ( ) ;
}
// Returns the BOS token id, which is the constant 1.
llama_token llama_token_bos() {
    const llama_token bos_id = 1;
    return bos_id;
}
// Returns the EOS token id, which is the constant 2.
llama_token llama_token_eos() {
    const llama_token eos_id = 2;
    return eos_id;
}
// Pointer-based wrapper around the llama_sample_top_p_top_k overload that takes
// the context by reference and the recent tokens as a std::vector.
// Copies the `last_n_tokens_size` tokens at `last_n_tokens_data` into a vector,
// forwards the sampling parameters, then adds the elapsed time to
// ctx->t_sample_us and increments ctx->n_sample. Returns the sampled token.
llama_token llama_sample_top_p_top_k(
          llama_context * ctx,
      const llama_token * last_n_tokens_data,
                    int   last_n_tokens_size,
                    int   top_k,
                  float   top_p,
                  float   temp,
                  float   repeat_penalty) {
    const int64_t t_begin_us = ggml_time_us();

    // TODO: avoid this ...
    const std::vector<llama_token> history(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);

    const llama_token picked = llama_sample_top_p_top_k(*ctx, history, top_k, top_p, temp, repeat_penalty);

    ctx->t_sample_us += ggml_time_us() - t_begin_us;
    ctx->n_sample    += 1;

    return picked;
}
void llama_print_timings ( struct llama_context * ctx ) {
const int64_t t_end_us = ggml_time_us ( ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
const int32_t n_sample = std : : max ( 1 , ctx - > n_sample ) ;
const int32_t n_eval = std : : max ( 1 , ctx - > n_eval ) ;
const int32_t n_p_eval = std : : max ( 1 , ctx - > n_p_eval ) ;
2023-03-22 05:32:36 +00:00
fprintf ( stderr , " \n " ) ;
2023-03-28 16:48:20 +00:00
fprintf ( stderr , " %s: load time = %8.2f ms \n " , __func__ , ctx - > t_load_us / 1000.0 ) ;
fprintf ( stderr , " %s: sample time = %8.2f ms / %5d runs (%8.2f ms per run) \n " , __func__ , 1e-3 * ctx - > t_sample_us , n_sample , 1e-3 * ctx - > t_sample_us / n_sample ) ;
fprintf ( stderr , " %s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token) \n " , __func__ , 1e-3 * ctx - > t_p_eval_us , n_p_eval , 1e-3 * ctx - > t_p_eval_us / n_p_eval ) ;
fprintf ( stderr , " %s: eval time = %8.2f ms / %5d runs (%8.2f ms per run) \n " , __func__ , 1e-3 * ctx - > t_eval_us , n_eval , 1e-3 * ctx - > t_eval_us / n_eval ) ;
fprintf ( stderr , " %s: total time = %8.2f ms \n " , __func__ , ( t_end_us - ctx - > t_start_us ) / 1000.0 ) ;
2023-03-22 05:32:36 +00:00
}
void llama_reset_timings ( struct llama_context * ctx ) {
ctx - > t_start_us = ggml_time_us ( ) ;
ctx - > t_sample_us = ctx - > n_sample = 0 ;
ctx - > t_eval_us = ctx - > n_eval = 0 ;
2023-03-25 14:34:23 +00:00
ctx - > t_p_eval_us = ctx - > n_p_eval = 0 ;
2023-03-22 05:32:36 +00:00
}
const char * llama_print_system_info ( void ) {
static std : : string s ;
s = " " ;
2023-04-17 13:10:57 +00:00
s + = " AVX = " + std : : to_string ( ggml_cpu_has_avx ( ) ) + " | " ;
s + = " AVX2 = " + std : : to_string ( ggml_cpu_has_avx2 ( ) ) + " | " ;
s + = " AVX512 = " + std : : to_string ( ggml_cpu_has_avx512 ( ) ) + " | " ;
s + = " AVX512_VBMI = " + std : : to_string ( ggml_cpu_has_avx512_vbmi ( ) ) + " | " ;
s + = " AVX512_VNNI = " + std : : to_string ( ggml_cpu_has_avx512_vnni ( ) ) + " | " ;
s + = " FMA = " + std : : to_string ( ggml_cpu_has_fma ( ) ) + " | " ;
s + = " NEON = " + std : : to_string ( ggml_cpu_has_neon ( ) ) + " | " ;
s + = " ARM_FMA = " + std : : to_string ( ggml_cpu_has_arm_fma ( ) ) + " | " ;
s + = " F16C = " + std : : to_string ( ggml_cpu_has_f16c ( ) ) + " | " ;
s + = " FP16_VA = " + std : : to_string ( ggml_cpu_has_fp16_va ( ) ) + " | " ;
s + = " WASM_SIMD = " + std : : to_string ( ggml_cpu_has_wasm_simd ( ) ) + " | " ;
s + = " BLAS = " + std : : to_string ( ggml_cpu_has_blas ( ) ) + " | " ;
s + = " SSE3 = " + std : : to_string ( ggml_cpu_has_sse3 ( ) ) + " | " ;
s + = " VSX = " + std : : to_string ( ggml_cpu_has_vsx ( ) ) + " | " ;
2023-03-22 05:32:36 +00:00
return s . c_str ( ) ;
}
2023-04-07 22:09:18 +00:00
// For internal test use
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 19:24:37 +00:00
std : : vector < std : : pair < std : : string , struct ggml_tensor * > > & llama_internal_get_tensor_map ( struct llama_context * ctx ) {
return ctx - > model . tensors_by_name ;
2023-04-07 22:09:18 +00:00
}
2023-04-22 06:21:32 +00:00
2023-04-28 15:59:37 +00:00
size_t llama_load_session_file ( struct llama_context * ctx , const char * path_session , llama_token * tokens_out , size_t n_token_capacity , size_t * n_token_count_out ) {
// TODO leverage mmap
llama_file file ( path_session , " rb " ) ;
const uint32_t magic = file . read_u32 ( ) ;
const uint32_t version = file . read_u32 ( ) ;
if ( ! ( magic = = ' ggsn ' & & version = = 0 ) ) {
fprintf ( stderr , " %s : unknown (magic, version) for session file: %08x, %08x \n " , __func__ , magic , version ) ;
return 0 ;
}
llama_hparams session_hparams ;
file . read_raw ( & session_hparams , sizeof ( llama_hparams ) ) ;
// REVIEW
if ( session_hparams ! = ctx - > model . hparams ) {
fprintf ( stderr , " %s : model hparams didn't match from session file! \n " , __func__ ) ;
return 0 ;
}
const uint32_t n_token_count = file . read_u32 ( ) ;
LLAMA_ASSERT ( n_token_capacity > = n_token_count ) ;
file . read_raw ( tokens_out , sizeof ( llama_token ) * n_token_count ) ;
* n_token_count_out = n_token_count ;
const size_t n_state_size = file . size - file . tell ( ) ;
const size_t n_orig_state_size = llama_get_state_size ( ctx ) ;
if ( n_state_size ! = n_orig_state_size ) {
fprintf ( stderr , " %s : failed to validate state size \n " , __func__ ) ;
}
std : : unique_ptr < uint8_t [ ] > state_data ( new uint8_t [ n_state_size ] ) ;
file . read_raw ( state_data . get ( ) , n_state_size ) ;
return llama_set_state_data ( ctx , state_data . get ( ) ) ;
}
size_t llama_save_session_file ( struct llama_context * ctx , const char * path_session , const llama_token * tokens , size_t n_token_count ) {
// TODO save temp & swap
llama_file file ( path_session , " wb " ) ;
const size_t n_state_size = llama_get_state_size ( ctx ) ;
std : : unique_ptr < uint8_t [ ] > state_data ( new uint8_t [ n_state_size ] ) ;
llama_copy_state_data ( ctx , state_data . get ( ) ) ;
file . write_u32 ( ' ggsn ' ) ; // magic
file . write_u32 ( 0 ) ; // version
file . write_raw ( & ctx - > model . hparams , sizeof ( llama_hparams ) ) ;
file . write_u32 ( ( uint32_t ) n_token_count ) ; // REVIEW
file . write_raw ( tokens , sizeof ( llama_token ) * n_token_count ) ;
file . write_raw ( state_data . get ( ) , n_state_size ) ;
return n_state_size ; // REVIEW
}