mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
convert : update for baichuan (#2081)
1. guess n_layers; 2. relax warnings on context size; 3. add a note that its derivations are also supported. Co-authored-by: Judd <foldl@boxvest.com>
This commit is contained in:
parent
a17a2683d8
commit
36680f6e40
@ -86,7 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
|
|||||||
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
||||||
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
||||||
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
||||||
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
|
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
|
||||||
|
|
||||||
**Bindings:**
|
**Bindings:**
|
||||||
|
|
||||||
|
@ -154,9 +154,15 @@ class Params:
|
|||||||
# try transformer naming first
|
# try transformer naming first
|
||||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||||
|
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
||||||
|
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||||
else:
|
else:
|
||||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
|
|
||||||
|
if n_layer < 1:
|
||||||
|
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
||||||
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
|
|
||||||
n_head=n_embd // 128 # guessed
|
n_head=n_embd // 128 # guessed
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
|
@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
} else if (params.n_ctx < 8) {
|
} else if (params.n_ctx < 8) {
|
||||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
|
@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ Command line options:
|
|||||||
- `--threads N`, `-t N`: Set the number of threads to use during computation.
|
- `--threads N`, `-t N`: Set the number of threads to use during computation.
|
||||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||||
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||||
|
Loading…
Reference in New Issue
Block a user