mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 04:00:16 +00:00
Merge remote-tracking branch 'origin/master' into tool-call
This commit is contained in:
commit
33ea20edd1
@ -1,7 +1,7 @@
|
||||
*.o
|
||||
*.a
|
||||
.cache/
|
||||
.git/
|
||||
# Do not ignore .git directory, otherwise the reported build number will always be 0
|
||||
.github/
|
||||
.gitignore
|
||||
.vs/
|
||||
|
58
.github/workflows/docker.yml
vendored
58
.github/workflows/docker.yml
vendored
@ -15,11 +15,17 @@ on:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||
workflow_dispatch: # allows manual triggering, useful for debugging
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
packages: write
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
@ -46,6 +52,8 @@ jobs:
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # preserve git history, so we can determine the build number
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
@ -60,6 +68,34 @@ jobs:
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
|
||||
# determine tag name postfix (build number, commit hash)
|
||||
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
||||
TAG_POSTFIX="b${BUILD_NUMBER}"
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
|
||||
fi
|
||||
|
||||
# list all tags possible
|
||||
TAGS=""
|
||||
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
|
||||
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
|
||||
|
||||
echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
|
||||
echo "output_tags=$TAGS" # print out for debugging
|
||||
env:
|
||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
@ -77,25 +113,6 @@ jobs:
|
||||
docker-images: true
|
||||
swap-storage: true
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Downcase github.repository_owner
|
||||
run: |
|
||||
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Build and push Docker image (tagged + versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v6
|
||||
@ -103,5 +120,6 @@ jobs:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params) {
|
||||
params.ctx_shift = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
||||
add_opt(llama_arg(
|
||||
{"--chunks"}, "N",
|
||||
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
||||
@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
|
||||
add_opt(llama_arg(
|
||||
{"--attention"}, "{causal,non,causal}",
|
||||
"attention type for embeddings, use model default if unspecified",
|
||||
@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
|
||||
add_opt(llama_arg(
|
||||
{"--rope-scale"}, "N",
|
||||
"RoPE context scaling factor, expands context by a factor of N",
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.rope_freq_scale = 1.0f / std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_ROPE_SCALE"));
|
||||
add_opt(llama_arg(
|
||||
{"--rope-freq-base"}, "N",
|
||||
"RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.rope_freq_base = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
|
||||
add_opt(llama_arg(
|
||||
{"--rope-freq-scale"}, "N",
|
||||
"RoPE frequency scaling factor, expands context by a factor of 1/N",
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.rope_freq_scale = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
|
||||
add_opt(llama_arg(
|
||||
{"--yarn-orig-ctx"}, "N",
|
||||
format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
||||
[](gpt_params & params, int value) {
|
||||
params.yarn_orig_ctx = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
|
||||
add_opt(llama_arg(
|
||||
{"--yarn-ext-factor"}, "N",
|
||||
format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.yarn_ext_factor = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
|
||||
add_opt(llama_arg(
|
||||
{"--yarn-attn-factor"}, "N",
|
||||
format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.yarn_attn_factor = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
|
||||
add_opt(llama_arg(
|
||||
{"--yarn-beta-slow"}, "N",
|
||||
format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.yarn_beta_slow = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
|
||||
add_opt(llama_arg(
|
||||
{"--yarn-beta-fast"}, "N",
|
||||
format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.yarn_beta_fast = std::stof(value);
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_YARN_BETA_FAST"));
|
||||
add_opt(llama_arg(
|
||||
{"-gan", "--grp-attn-n"}, "N",
|
||||
format("group-attention factor (default: %d)", params.grp_attn_n),
|
||||
[](gpt_params & params, int value) {
|
||||
params.grp_attn_n = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_GRP_ATTN_N"));
|
||||
add_opt(llama_arg(
|
||||
{"-gaw", "--grp-attn-w"}, "N",
|
||||
format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
|
||||
[](gpt_params & params, int value) {
|
||||
params.grp_attn_w = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_GRP_ATTN_W"));
|
||||
add_opt(llama_arg(
|
||||
{"-dkvc", "--dump-kv-cache"},
|
||||
"verbose print of the KV cache",
|
||||
@ -1205,7 +1205,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params) {
|
||||
params.no_kv_offload = true;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
||||
add_opt(llama_arg(
|
||||
{"-ctk", "--cache-type-k"}, "TYPE",
|
||||
format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
||||
@ -1213,7 +1213,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
// TODO: get the type right here
|
||||
params.cache_type_k = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
||||
add_opt(llama_arg(
|
||||
{"-ctv", "--cache-type-v"}, "TYPE",
|
||||
format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
||||
@ -1221,7 +1221,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
// TODO: get the type right here
|
||||
params.cache_type_v = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||
add_opt(llama_arg(
|
||||
{"--perplexity", "--all-logits"},
|
||||
format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
||||
@ -1355,7 +1355,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.rpc_servers = value;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_RPC"));
|
||||
#endif
|
||||
add_opt(llama_arg(
|
||||
{"--mlock"},
|
||||
@ -1363,14 +1363,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params) {
|
||||
params.use_mlock = true;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_MLOCK"));
|
||||
add_opt(llama_arg(
|
||||
{"--no-mmap"},
|
||||
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
||||
[](gpt_params & params) {
|
||||
params.use_mmap = false;
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_NO_MMAP"));
|
||||
add_opt(llama_arg(
|
||||
{"--numa"}, "TYPE",
|
||||
"attempt optimizations that help on some NUMA systems\n"
|
||||
@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_NUMA"));
|
||||
add_opt(llama_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_SPLIT_MODE"));
|
||||
add_opt(llama_arg(
|
||||
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
|
||||
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
|
||||
@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_TENSOR_SPLIT"));
|
||||
add_opt(llama_arg(
|
||||
{"-mg", "--main-gpu"}, "INDEX",
|
||||
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
||||
@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
).set_env("LLAMA_ARG_MAIN_GPU"));
|
||||
add_opt(llama_arg(
|
||||
{"--check-tensors"},
|
||||
format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
||||
@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.model_alias = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
|
||||
add_opt(llama_arg(
|
||||
{"-m", "--model"}, "FNAME",
|
||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||
@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.public_path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
||||
add_opt(llama_arg(
|
||||
{"--embedding", "--embeddings"},
|
||||
format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
||||
@ -1779,14 +1779,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.ssl_file_key = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
|
||||
add_opt(llama_arg(
|
||||
{"--ssl-cert-file"}, "FNAME",
|
||||
"path to file a PEM-encoded SSL certificate",
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.ssl_file_cert = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
||||
add_opt(llama_arg(
|
||||
{"-to", "--timeout"}, "N",
|
||||
format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||
@ -1794,7 +1794,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
params.timeout_read = value;
|
||||
params.timeout_write = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
||||
add_opt(llama_arg(
|
||||
{"--threads-http"}, "N",
|
||||
format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
||||
|
@ -4102,16 +4102,45 @@ class GraniteModel(LlamaModel):
|
||||
# consistency
|
||||
if attention_scale := self.hparams.get("attention_multiplier"):
|
||||
self.gguf_writer.add_attention_scale(attention_scale)
|
||||
logger.info("gguf: (granite) attention_scale = %s", attention_scale)
|
||||
if embedding_scale := self.hparams.get("embedding_multiplier"):
|
||||
self.gguf_writer.add_embedding_scale(embedding_scale)
|
||||
logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
|
||||
if residual_scale := self.hparams.get("residual_multiplier"):
|
||||
self.gguf_writer.add_residual_scale(residual_scale)
|
||||
if logits_scaling := self.hparams.get("logits_scaling"):
|
||||
self.gguf_writer.add_logit_scale(logits_scaling)
|
||||
logger.info("gguf: (granite) residual_scale = %s", residual_scale)
|
||||
if logits_scale := self.hparams.get("logits_scaling"):
|
||||
self.gguf_writer.add_logit_scale(logits_scale)
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
|
||||
@Model.register("GraniteMoeForCausalLM")
|
||||
class GraniteMoeModel(GraniteModel):
|
||||
"""Conversion for IBM's GraniteMoeForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
|
||||
is used. This essentially merges w1 and w3 into a single tensor with 2x
|
||||
the hidden size that is then split during forward. To keep compatibility
|
||||
with existing mixtral support, we pull them apart here.
|
||||
"""
|
||||
|
||||
if name.endswith("block_sparse_moe.input_linear.weight"):
|
||||
ffn_dim = self.hparams["intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
|
||||
gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
|
||||
]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
# tree of lazy tensors
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
_tensor_type = torch.Tensor
|
||||
|
@ -6,42 +6,73 @@
|
||||
|
||||
// Export usage message (-h) to markdown format
|
||||
|
||||
static void write_table_header(std::ofstream & file) {
|
||||
file << "| Argument | Explanation |\n";
|
||||
file << "| -------- | ----------- |\n";
|
||||
}
|
||||
|
||||
static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
|
||||
file << "| `";
|
||||
// args
|
||||
for (const auto & arg : opt.args) {
|
||||
if (arg == opt.args.front()) {
|
||||
file << arg;
|
||||
if (opt.args.size() > 1) file << ", ";
|
||||
} else {
|
||||
file << arg << (arg != opt.args.back() ? ", " : "");
|
||||
}
|
||||
}
|
||||
// value hint
|
||||
if (opt.value_hint) {
|
||||
std::string md_value_hint(opt.value_hint);
|
||||
string_replace_all(md_value_hint, "|", "\\|");
|
||||
file << " " << md_value_hint;
|
||||
}
|
||||
if (opt.value_hint_2) {
|
||||
std::string md_value_hint_2(opt.value_hint_2);
|
||||
string_replace_all(md_value_hint_2, "|", "\\|");
|
||||
file << " " << md_value_hint_2;
|
||||
}
|
||||
// help text
|
||||
std::string md_help(opt.help);
|
||||
string_replace_all(md_help, "\n", "<br/>");
|
||||
string_replace_all(md_help, "|", "\\|");
|
||||
file << "` | " << md_help << " |\n";
|
||||
}
|
||||
|
||||
static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
|
||||
write_table_header(file);
|
||||
for (const auto & opt : opts) {
|
||||
write_table_entry(file, *opt);
|
||||
}
|
||||
}
|
||||
|
||||
static void export_md(std::string fname, llama_example ex) {
|
||||
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||
|
||||
gpt_params params;
|
||||
auto ctx_arg = gpt_params_parser_init(params, ex);
|
||||
|
||||
file << "| Argument | Explanation |\n";
|
||||
file << "| -------- | ----------- |\n";
|
||||
std::vector<llama_arg *> common_options;
|
||||
std::vector<llama_arg *> sparam_options;
|
||||
std::vector<llama_arg *> specific_options;
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
file << "| `";
|
||||
// args
|
||||
for (const auto & arg : opt.args) {
|
||||
if (arg == opt.args.front()) {
|
||||
file << arg;
|
||||
if (opt.args.size() > 1) file << ", ";
|
||||
} else {
|
||||
file << arg << (arg != opt.args.back() ? ", " : "");
|
||||
}
|
||||
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
||||
if (opt.is_sparam) {
|
||||
sparam_options.push_back(&opt);
|
||||
} else if (opt.in_example(ctx_arg.ex)) {
|
||||
specific_options.push_back(&opt);
|
||||
} else {
|
||||
common_options.push_back(&opt);
|
||||
}
|
||||
// value hint
|
||||
if (opt.value_hint) {
|
||||
std::string md_value_hint(opt.value_hint);
|
||||
string_replace_all(md_value_hint, "|", "\\|");
|
||||
file << " " << md_value_hint;
|
||||
}
|
||||
if (opt.value_hint_2) {
|
||||
std::string md_value_hint_2(opt.value_hint_2);
|
||||
string_replace_all(md_value_hint_2, "|", "\\|");
|
||||
file << " " << md_value_hint_2;
|
||||
}
|
||||
// help text
|
||||
std::string md_help(opt.help);
|
||||
string_replace_all(md_help, "\n", "<br/>");
|
||||
string_replace_all(md_help, "|", "\\|");
|
||||
file << "` | " << md_help << " |\n";
|
||||
}
|
||||
|
||||
file << "**Common params**\n\n";
|
||||
write_table(file, common_options);
|
||||
file << "\n\n**Sampling params**\n\n";
|
||||
write_table(file, sparam_options);
|
||||
file << "\n\n**Example-specific params**\n\n";
|
||||
write_table(file, specific_options);
|
||||
}
|
||||
|
||||
int main(int, char **) {
|
||||
|
@ -17,6 +17,8 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
## Usage
|
||||
|
||||
**Common params**
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `-h, --help, --usage` | print usage and exit |
|
||||
@ -38,7 +40,6 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
||||
@ -46,8 +47,56 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
| `--no-escape` | do not process escape sequences |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
||||
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
|
||||
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
|
||||
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
|
||||
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
|
||||
| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
|
||||
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
||||
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file |
|
||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
|
||||
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
|
||||
|
||||
|
||||
**Sampling params**
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||
@ -73,54 +122,27 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--grammar-file FNAME` | file to read grammar from |
|
||||
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
||||
| `--jinja` | Enable (limited) Jinja templating engine, which is needed for tool use. |
|
||||
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model |
|
||||
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N |
|
||||
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) |
|
||||
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N |
|
||||
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) |
|
||||
| `-gan, --grp-attn-n N` | group-attention factor (default: 1) |
|
||||
| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) |
|
||||
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
|
||||
**Example-specific params**
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
||||
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: ) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
||||
@ -129,14 +151,6 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
|
||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file |
|
||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
|
||||
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
|
||||
|
||||
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
|
||||
|
||||
|
@ -2361,6 +2361,10 @@ int main(int argc, char ** argv) {
|
||||
svr.reset(new httplib::Server());
|
||||
}
|
||||
#else
|
||||
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
|
||||
LOG_ERR("Server is built without SSL support\n");
|
||||
return 1;
|
||||
}
|
||||
svr.reset(new httplib::Server());
|
||||
#endif
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -227,6 +227,7 @@ struct ggml_backend_cann_context {
|
||||
* @brief Destructor for cleaning up resources.
|
||||
*/
|
||||
~ggml_backend_cann_context() {
|
||||
ggml_cann_set_device(device);
|
||||
if (copy_event != nullptr) {
|
||||
ACL_CHECK(aclrtDestroyEvent(copy_event));
|
||||
}
|
||||
|
@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum):
|
||||
NEMOTRON = auto()
|
||||
EXAONE = auto()
|
||||
GRANITE = auto()
|
||||
GRANITE_MOE = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@ -392,6 +393,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||
MODEL_ARCH.EXAONE: "exaone",
|
||||
MODEL_ARCH.GRANITE: "granite",
|
||||
MODEL_ARCH.GRANITE_MOE: "granitemoe",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@ -1232,6 +1234,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_ARCH.GRANITE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
@ -1242,6 +1245,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GRANITE_MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
@ -251,11 +251,12 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
|
||||
"transformer.decoder_layer.{bid}.router", # Grok
|
||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
|
||||
"transformer.decoder_layer.{bid}.router", # Grok
|
||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||
@ -364,10 +365,11 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
|
||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
|
||||
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
|
@ -217,6 +217,7 @@ enum llm_arch {
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_RWKV6,
|
||||
LLM_ARCH_GRANITE,
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -268,6 +269,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
{ LLM_ARCH_GRANITE, "granite" },
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@ -1469,6 +1471,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
@ -1480,6 +1483,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@ -2398,7 +2419,7 @@ struct llama_hparams {
|
||||
float f_max_alibi_bias = 0.0f;
|
||||
float f_logit_scale = 0.0f;
|
||||
|
||||
// Additional scale factors (Granite)
|
||||
// Additional scale factors (Granite/Granite MoE)
|
||||
float f_residual_scale = 0.0f;
|
||||
float f_embedding_scale = 0.0f;
|
||||
float f_attention_scale = 0.0f;
|
||||
@ -6050,6 +6071,7 @@ static void llm_load_hparams(
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
@ -6058,6 +6080,7 @@ static void llm_load_hparams(
|
||||
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_3B; break;
|
||||
case 40: model.type = e_model::MODEL_3B; break;
|
||||
// Add additional layer/vocab/etc checks here for other model sizes
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
@ -6812,7 +6835,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_GRANITE) {
|
||||
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
||||
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
||||
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
||||
@ -6986,6 +7009,7 @@ static bool llm_load_tensors(
|
||||
case LLM_ARCH_REFACT:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
@ -15932,6 +15956,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
switch (model.arch) {
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
{
|
||||
result = llm.build_llama();
|
||||
} break;
|
||||
@ -19233,6 +19258,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_CHATGLM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
|
Loading…
Reference in New Issue
Block a user