Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-11-14 14:59:52 +00:00)
examples/main README improvements and some light refactoring (#1131)
This commit is contained in: parent 2ec83428de · commit 9b0a4d4214
@@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
 
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 
@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
         } else if (arg == "--color") {
@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
 
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
 
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos
@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
 
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
 For an interactive experience, try this command:
 
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
 
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
 ## Common Options
 
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 
 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
 
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 
 ## Context Management
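To make the "technical detail" line added in the hunk above more concrete, here is a minimal, illustrative C++ sketch of wrapping a user's input with the instruction prefix and suffix the README describes. It is not code from this commit; the `inp_prefix`/`inp_suffix` names and the exact spacing are assumptions for the example.

```cpp
#include <iostream>
#include <string>

// Illustrative only: wrap raw user input in the Alpaca-style template described
// above ("### Instruction:" before the input, "### Response:" after it).
std::string wrap_instruct(const std::string & user_input) {
    const std::string inp_prefix = "### Instruction:\n\n"; // assumed default reverse prompt
    const std::string inp_suffix = "### Response:\n\n";
    if (user_input.empty()) {
        // Just pressing Return adds no new instruction block, so the model keeps
        // extending its previous response.
        return "";
    }
    return "\n\n" + inp_prefix + user_input + "\n\n" + inp_suffix;
}

int main() {
    std::cout << wrap_instruct("Write a haiku about llamas.") << std::endl;
    return 0;
}
```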
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
 
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 
 ### RNG Seed
 
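As a rough illustration of the stopping behaviour described in the hunk above, the following self-contained C++ sketch combines an `n_predict` limit, an EOS token, and an `ignore_eos` flag. It is not code from this commit; `next_token()` and `EOS_TOKEN` are stand-ins, not llama.cpp API.

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative sketch of the stop conditions described above.
constexpr int EOS_TOKEN = 2; // assumed EOS id, for the example only

// Stand-in for the sampler: returns a fake token id.
static int next_token() {
    return std::rand() % 32000;
}

static void generate(int n_predict, bool ignore_eos) {
    int n_generated = 0;
    // n_predict == -1 means "no limit", mirroring the README text.
    while (n_predict < 0 || n_generated < n_predict) {
        const int tok = next_token();
        if (tok == EOS_TOKEN && !ignore_eos) {
            break; // stop early on End-of-Sequence unless --ignore-eos was given
        }
        ++n_generated;
        // ... a real program would print or store the token here ...
    }
    std::printf("generated %d tokens\n", n_generated);
}

int main() {
    generate(256, /*ignore_eos=*/false);
    return 0;
}
```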
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 
 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).
 
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 
 Example usage: `--temp 0.8`
 
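To make the temperature explanation above concrete, here is a small, self-contained C++ sketch of temperature-scaled softmax sampling over raw logits. It is illustrative only, not llama.cpp's sampler; a temperature of 0 is treated as greedy argmax, matching the sentence added in this hunk.

```cpp
#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Illustrative only: pick a token index from logits using temperature scaling.
// temp <= 0 degenerates to greedy argmax (always the most likely token).
int sample_with_temperature(const std::vector<float> & logits, float temp, std::mt19937 & rng) {
    if (temp <= 0.0f) {
        return (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
    }
    // Scale logits by 1/temp, then apply a numerically stable softmax.
    std::vector<double> probs(logits.size());
    const double max_logit = *std::max_element(logits.begin(), logits.end());
    double sum = 0.0;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp((logits[i] - max_logit) / temp);
        sum += probs[i];
    }
    for (double & p : probs) {
        p /= sum;
    }
    // Higher temp -> flatter distribution -> more random picks.
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return dist(rng);
}

int main() {
    std::mt19937 rng(1234);
    std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f};
    (void)sample_with_temperature(logits, 0.8f, rng);
    return 0;
}
```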
@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
        params.interactive = true;
     }
 
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
 
     bool is_antiprompt = false;
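For context on the `interactive_first` / antiprompt logic touched in the two hunks above, here is a brief, illustrative C++ sketch of how a reverse prompt can hand control back to the user during generation. It is not the actual main.cpp; the helper name and the flow are assumptions made for the example.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Illustrative sketch only: check whether the generated text ends with one of
// the reverse prompts (antiprompts) and, if so, pause to read more user input.
static bool ends_with_antiprompt(const std::string & output, const std::vector<std::string> & antiprompts) {
    for (const std::string & ap : antiprompts) {
        if (output.size() >= ap.size() &&
            output.compare(output.size() - ap.size(), ap.size(), ap) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    std::vector<std::string> antiprompt = {"User:"};
    bool interactive_first = true;           // mirrors params.interactive_first
    bool is_interacting = interactive_first; // wait for the user before generating

    std::string output = "AI: What would you like to talk about?\nUser:";
    if (is_interacting || ends_with_antiprompt(output, antiprompt)) {
        std::string user_line;
        std::getline(std::cin, user_line);   // control returns to the user here
        output += " " + user_line + "\n";
    }
    std::cout << output << std::endl;
    return 0;
}
```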