diff --git a/.gitignore b/.gitignore index 9b6905ed4..cc4e8bb7c 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,6 @@ qnt-*.txt perf-*.txt examples/jeopardy/results.txt +demo_falcon_orig.cpp +.github/workflows/build.yml +.github/workflows/build.yml diff --git a/CMakeLists.txt b/CMakeLists.txt index 19cd42dd2..2e91e8f23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,9 @@ cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason -project("llama.cpp" C CXX) - +# If CUDA toolkit is not found using msvc compiler switch to Community Edition (same compiler, just other kit..) +project("ggllm.cpp" C CXX) +# add_definitions(-DGGML_PERF=1) +include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include") +include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) @@ -20,7 +23,7 @@ else() endif() if (EMSCRIPTEN) - set(BUILD_SHARED_LIBS_DEFAULT OFF) + set(BUILD_SHARED_LIBS_DEFAULT OFF) option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON) else() @@ -67,7 +70,7 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_CUBLAS "llama: use cuBLAS" ON) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) @@ -177,13 +180,14 @@ if (LLAMA_BLAS) endif() if (LLAMA_CUBLAS) - cmake_minimum_required(VERSION 3.17) + cmake_minimum_required(VERSION 3.17) find_package(CUDAToolkit) if (CUDAToolkit_FOUND) message(STATUS "cuBLAS found") enable_language(CUDA) + message(STATUS "CUDA found, version: ${CUDAToolkit_VERSION}") set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) @@ -429,12 +433,34 @@ target_link_libraries(llama PRIVATE ${LLAMA_EXTRA_LIBS} ) +# falcon +add_library(libfalcon + libfalcon.cpp + libfalcon.h + llama-util.h + ) +target_include_directories(libfalcon PUBLIC .) +target_compile_features(libfalcon PUBLIC cxx_std_11) # don't bump +target_link_libraries(libfalcon PRIVATE + ggml + ${LLAMA_EXTRA_LIBS} + ) +# + if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) if (LLAMA_METAL) set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") endif() + + # falcon + set_target_properties(libfalcon PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(libfalcon PRIVATE LLAMA_SHARED LLAMA_BUILD) + if (LLAMA_METAL) + set_target_properties(libfalcon PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + endif() + # endif() if (GGML_SOURCES_CUDA) @@ -442,6 +468,9 @@ if (GGML_SOURCES_CUDA) set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) + # falcon + set_property(TARGET libfalcon PROPERTY CUDA_ARCHITECTURES OFF) + endif() diff --git a/Makefile b/Makefile index 9a08d610b..2a1ed8c7e 100644 --- a/Makefile +++ b/Makefile @@ -252,9 +252,15 @@ ggml.o: ggml.c ggml.h ggml-cuda.h llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ +libfalcon.o: libfalcon.cpp ggml.h ggml-cuda.h libfalcon.h llama-util.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ +falcom_common.o: examples/falcon_common.cpp examples/falcon_common.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -274,6 +280,9 @@ main: examples/main/main.cpp build-info.h ggml. quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +falcon_quantize: examples/falcon_quantize/quantize.cpp build-info.h ggml.o libfalcon.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -297,6 +306,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh rm $@.tmp; \ fi +falcon_main: examples/falcon/falcon_main.cpp build-info.h ggml.o libfalcon.o falcon_common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) # # Tests # diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index de005f3e3..28d2d8cdf 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,7 +6,7 @@ find_package(Threads REQUIRED) # ... -# common +# common set(TARGET common) @@ -23,6 +23,24 @@ target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) target_link_libraries(${TARGET} PRIVATE llama) + +# falcon_common + +set(FALCON_TARGET falcon_common) + +add_library(${FALCON_TARGET} OBJECT + falcon_common.h + falcon_common.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${FALCON_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${FALCON_TARGET} PUBLIC .) +target_compile_features(${FALCON_TARGET} PUBLIC cxx_std_11) +target_link_libraries(${FALCON_TARGET} PRIVATE libfalcon) + # examples include_directories(${CMAKE_CURRENT_SOURCE_DIR}) @@ -30,6 +48,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() add_subdirectory(main) + add_subdirectory(falcon) + add_subdirectory(falcon_quantize) add_subdirectory(quantize) add_subdirectory(quantize-stats) add_subdirectory(perplexity) diff --git a/examples/falcon/CMakeLists.txt b/examples/falcon/CMakeLists.txt new file mode 100644 index 000000000..19fca2355 --- /dev/null +++ b/examples/falcon/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET falcon_main) +add_executable(${TARGET} falcon_main.cpp) +target_link_libraries(${TARGET} PRIVATE falcon_common libfalcon ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() + diff --git a/examples/falcon/README.md b/examples/falcon/README.md new file mode 100644 index 000000000..149d507a8 --- /dev/null +++ b/examples/falcon/README.md @@ -0,0 +1,292 @@ +# llama.cpp/example/main + +This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Common Options](#common-options) +3. [Input Prompts](#input-prompts) +4. [Interaction](#interaction) +5. [Context Management](#context-management) +6. [Generation Flags](#generation-flags) +7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) +8. [Additional Options](#additional-options) + +## Quick Start + +To get started right away, run the following command, making sure to use the correct path for the model you have: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin --prompt "Once upon a time" +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time" +``` + +For an interactive experience, try this command: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +'User: Hi +AI: Hello. I am an AI chatbot. Would you like to talk? +User: Sure! +AI: What would you like to talk about? +User:' +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +``` + +The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +## Common Options + +In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. + +## Input Prompts + +The `main` program provides several ways to interact with the LLaMA models using input prompts: + +- `--prompt PROMPT`: Provide a prompt directly as a command-line option. +- `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) +- `--random-prompt`: Start with a randomized prompt. + +## Interaction + +The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`. + +In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. + +### Interaction Options + +- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. +- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. +- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions. +- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. + +By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. + +### Reverse Prompts + +Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered: + +- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space. + +To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt. + +### In-Prefix + +The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./main -r "User:" --in-prefix " " +``` + +### In-Suffix + +The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./main -r "User:" --in-prefix " " --in-suffix "Assistant:" +``` + +### Instruction Mode + +Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks: + +- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions. + +Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response). + +By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. + +## Context Management + +During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. + +### Context Size + +The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. + +- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results. + +### Keep Prompt + +The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. + +- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + +By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. + +## Generation Flags + +The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case. + +### Number of Tokens to Predict + +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + +The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit. + +It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. + +### Temperature + +- `--temp N`: Adjust the randomness of the generated text (default: 0.8). + +Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + +Example usage: `--temp 0.5` + +### Repeat Penalty + +- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1). +- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). +- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. + +The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1. + +The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). + +Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases. + +Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl` + +### Top-K Sampling + +- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40). + +Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. + +Example usage: `--top-k 30` + +### Top-P Sampling + +- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + +Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. + +Example usage: `--top-p 0.95` + +### Tail Free Sampling (TFS) + +- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). + +Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced. + +Example usage: `--tfs 2.0` + +### Locally Typical Sampling + +- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). + +Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling. + +Example usage: `--typical 0.9` + +### Mirostat Sampling + +- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). +- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1). +- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0). + +Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps). + +The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`. + +The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`. + +Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` + +### Logit Bias + +- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. + +The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated. + +For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced. + +A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.) + +Example usage: `--logit-bias 29905-inf` + +### RNG Seed + +- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). + +The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. + +## Performance Tuning and Memory Options + +These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case. + +### Number of Threads + +- `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance. + +### Mlock + +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM. + +### No Memory Mapping + +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all. + +### Memory Float 32 + +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended. + +### Batch Size + +- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. + +### Prompt Caching + +- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. + +### Quantization + +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run). + +## Additional Options + +These options provide extra functionality and customization when running the LLaMA models: + +- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. +- `--verbose-prompt`: Print the prompt before generating text. +- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly. +- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/falcon/falcon_main.cpp b/examples/falcon/falcon_main.cpp new file mode 100644 index 000000000..efd677905 --- /dev/null +++ b/examples/falcon/falcon_main.cpp @@ -0,0 +1,665 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "falcon_common.h" +#include "libfalcon.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + +static console_state con_st; +static falcon_context ** g_ctx; + +static bool is_interacting = false; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + console_cleanup(con_st); + printf("\n"); + falcon_print_timings(*g_ctx); + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + con_st.use_color = params.use_color; + con_st.multiline_input = params.multiline_input; + console_init(con_st); + atexit([]() { console_cleanup(con_st); }); + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } else if (params.n_ctx < 8) { + fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_init_backend(); + + falcon_context * ctx; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + ctx = falcon_init_from_gpt_params(params); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), falcon_print_system_info()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector tmp(params.n_batch, falcon_token_bos()); + falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector tmp = { 0, }; + falcon_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + falcon_print_timings(ctx); + llama_free(ctx); + + return 0; + } + + // export the cgraph and exit + if (params.export_cgraph) { + falcon_eval_export(ctx, "llama.ggml"); + llama_free(ctx); + + return 0; + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + + // fopen to check for existing session + FILE * fp = std::fopen(path_session.c_str(), "rb"); + if (fp != NULL) { + std::fclose(fp); + + session_tokens.resize(params.n_ctx); + size_t n_token_count_out = 0; + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); + + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + } else { + fprintf(stderr, "%s: session file does not exist, will create\n", __func__); + } + } + + // tokenize the prompt + std::vector embd_inp; + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + embd_inp = ::falcon_tokenize(ctx, params.prompt, true); + } else { + embd_inp = session_tokens; + } + + const int n_ctx = falcon_n_ctx(ctx); + + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (session_tokens.size()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + fprintf(stderr, "%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { + fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { + fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } else { + fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + } + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && + session_tokens.size() > embd_inp.size()) { + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { + params.n_keep = (int)embd_inp.size(); + } + + // prefix & suffix for instruct mode + const auto inp_pfx = ::falcon_tokenize(ctx, "\n\n### Instruction:\n\n", true); + const auto inp_sfx = ::falcon_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive_first = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + // determine newline token + auto llama_token_newline = ::falcon_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], falcon_token_to_str(ctx, embd_inp[i])); + } + if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", falcon_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "'\n"); + } + fprintf(stderr, "\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(static_cast(console_ctrl_handler), true); +#endif + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if (params.antiprompt.size()) { + for (auto antiprompt : params.antiprompt) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + } + + if (!params.input_prefix.empty()) { + fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + } + + if (!params.input_suffix.empty()) { + fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + } + } + fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + fprintf(stderr, "\n\n"); + + // TODO: replace with ring-buffer + std::vector last_n_tokens(n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + if (params.interactive) { + const char *control_message; + if (con_st.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + fprintf(stderr, "== Running in interactive mode. ==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + + // the first thing we will do is to output the prompt, so set color accordingly + console_set_color(con_st, CONSOLE_COLOR_PROMPT); + + std::vector embd; + + // do one empty run to warm up the model + { + const std::vector tmp = { falcon_token_bos(), }; + falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + llama_reset_timings(ctx); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + console_set_color(con_st, CONSOLE_COLOR_ERROR); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + fflush(stdout); + embd.resize(max_embd_size); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() > n_ctx) { + const int n_left = n_past - params.n_keep; + + // always keep the first token - BOS + n_past = std::max(1, params.n_keep); + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + // stop saving session if we run out of context + path_session.clear(); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", falcon_token_to_str(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (falcon_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past += n_eval; + } + + if (embd.size() > 0 && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? falcon_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + need_to_save_session = false; + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_token id = 0; + + { + auto logits = falcon_get_logits(ctx); + auto n_vocab = falcon_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[falcon_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[falcon_token_nl()] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + // printf("`%d`", candidates_p.size); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // replace end of text token with newline token when in interactive mode + if (id == falcon_token_eos() && params.interactive && !params.instruct) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::falcon_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + + // add it to the context + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + } else { + // some user input remains from prompt or interaction, forward it to processing + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo) { + for (auto id : embd) { + printf("%s", falcon_token_to_str(ctx, id)); + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (input_echo && (int)embd_inp.size() == n_consumed) { + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + + // check for reverse prompt + if (params.antiprompt.size()) { + std::string last_output; + for (auto id : last_n_tokens) { + last_output += falcon_token_to_str(ctx, id); + } + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); + } + is_antiprompt = true; + fflush(stdout); + break; + } + } + } + + if (n_past > 0 && is_interacting) { + if (params.instruct) { + printf("\n> "); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + another_line = console_readline(con_st, line); + buffer += line; + } while (another_line); + + // done taking input, reset color + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + buffer += params.input_suffix; + printf("%s", params.input_suffix.c_str()); + } + + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + } + + auto line_inp = ::falcon_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + // instruct mode: insert response suffix + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + n_remain -= line_inp.size(); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == falcon_token_eos()) { + if (params.instruct) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + if (params.interactive && n_remain <= 0 && params.n_predict != -1) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + falcon_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/falcon_common.cpp b/examples/falcon_common.cpp new file mode 100644 index 000000000..9aa6d2942 --- /dev/null +++ b/examples/falcon_common.cpp @@ -0,0 +1,930 @@ +#include "falcon_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#include +#else +#include +#include +#include +#endif + +int32_t get_num_physical_cores() { +#ifdef __linux__ + // enumerate the set of thread siblings, num entries is num cores + std::unordered_set siblings; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + std::ifstream thread_siblings("/sys/devices/system/cpu" + + std::to_string(cpu) + "/topology/thread_siblings"); + if (!thread_siblings.is_open()) { + break; // no more cpus + } + std::string line; + if (std::getline(thread_siblings, line)) { + siblings.insert(line); + } + } + if (siblings.size() > 0) { + return static_cast(siblings.size()); + } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } +#elif defined(_WIN32) + //TODO: Implement +#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; +} + +void process_escapes(std::string& input) { + std::size_t input_len = input.length(); + std::size_t output_idx = 0; + + for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { + if (input[input_idx] == '\\' && input_idx + 1 < input_len) { + switch (input[++input_idx]) { + case 'n': input[output_idx++] = '\n'; break; + case 'r': input[output_idx++] = '\r'; break; + case 't': input[output_idx++] = '\t'; break; + case '\'': input[output_idx++] = '\''; break; + case '\"': input[output_idx++] = '\"'; break; + case '\\': input[output_idx++] = '\\'; break; + default: input[output_idx++] = '\\'; + input[output_idx++] = input[input_idx]; break; + } + } else { + input[output_idx++] = input[input_idx]; + } + } + + input.resize(output_idx); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + bool escape_prompt = false; + std::string arg; + gpt_params default_params; + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-s" || arg == "--seed") { +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); +#endif + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; + } else if (arg == "-e") { + escape_prompt = true; + } else if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.path_prompt_cache = argv[i]; + } else if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + } else if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + } else if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--memory-f32") { + params.memory_f16 = false; + } else if (arg == "--top-p") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); + } else if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); + } else if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tfs_z = std::stof(argv[i]); + } else if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.typical_p = std::stof(argv[i]); + } else if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); + } else if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); + } else if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.frequency_penalty = std::stof(argv[i]); + } else if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.presence_penalty = std::stof(argv[i]); + } else if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat = std::stoi(argv[i]); + } else if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_eta = std::stof(argv[i]); + } else if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_tau = std::stof(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_keep = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-first") { + params.interactive_first = true; + } else if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + } else if (arg == "--multiline-input") { + params.multiline_input = true; + } else if (arg == "--color") { + params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif + } else if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); +#endif + } else if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else if (arg == "--mtest") { + params.mem_test = true; + } else if (arg == "--export") { + params.export_cgraph = true; + } else if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + } else if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); + } else if (arg == "--perplexity") { + params.perplexity = true; + } else if (arg == "--ignore-eos") { + params.logit_bias[falcon_token_eos()] = -INFINITY; + } else if (arg == "--no-penalize-nl") { + params.penalize_nl = false; + } else if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } else { + throw std::exception(); + } + } catch (const std::exception &e) { + invalid_param = true; + break; + } + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, default_params); + exit(0); + } else if (arg == "--random-prompt") { + params.random_prompt = true; + } else if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_prefix = argv[i]; + } else if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_suffix = argv[i]; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + if (params.prompt_cache_all && + (params.interactive || params.interactive_first || + params.instruct)) { + fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n"); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + if (escape_prompt) { + process_escapes(params.prompt); + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -i, --interactive run in interactive mode\n"); + fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); + fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); + fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); + fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n"); + fprintf(stderr, " (can be specified more than once for multiple prompts).\n"); + fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: empty)\n"); + fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); + fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); + fprintf(stderr, " not supported with --interactive or other interactive options\n"); + fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); + fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); + fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " prompt file to start generation.\n"); + fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); + fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); + fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); + fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + if (llama_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap Do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); + fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); +#endif + fprintf(stderr, " --mtest compute maximum memory usage\n"); + fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); + fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +// TODO: not great allocating this every time +std::vector falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos) { + // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars + std::vector res(text.size() + (int) add_bos); + const int n = falcon_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); + res.resize(n); + + return res; +} + +struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) { + auto lparams = falcon_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_batch = params.n_batch; + lparams.n_gpu_layers = params.n_gpu_layers; + lparams.main_gpu = params.main_gpu; + memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float)); + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; + + falcon_context * lctx = falcon_init_from_file(params.model.c_str(), lparams); + + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return NULL; + } + + // if (!params.lora_adapter.empty()) { + // int err = llama_apply_lora_from_file(lctx, + // params.lora_adapter.c_str(), + // params.lora_base.empty() ? NULL : params.lora_base.c_str(), + // params.n_threads); + // if (err != 0) { + // fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + // return NULL; + // } + // } + + return lctx; +} + +void console_init(console_state & con_st) { +#if defined(_WIN32) + // Windows-specific console initialization + DWORD dwMode = 0; + con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) { + con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE); + if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) { + con_st.hConsole = NULL; + } + } + if (con_st.hConsole) { + // Enable ANSI colors on Windows 10+ + if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) { + SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); + } + // Set console output codepage to UTF8 + SetConsoleOutputCP(CP_UTF8); + } + HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE); + if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) { + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); + + // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + SetConsoleMode(hConIn, dwMode); + } +#else + // POSIX-specific console initialization + struct termios new_termios; + tcgetattr(STDIN_FILENO, &con_st.prev_state); + new_termios = con_st.prev_state; + new_termios.c_lflag &= ~(ICANON | ECHO); + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + tcsetattr(STDIN_FILENO, TCSANOW, &new_termios); + + con_st.tty = fopen("/dev/tty", "w+"); + if (con_st.tty != nullptr) { + con_st.out = con_st.tty; + } + + setlocale(LC_ALL, ""); +#endif +} + +void console_cleanup(console_state & con_st) { + // Reset console color + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + +#if !defined(_WIN32) + if (con_st.tty != nullptr) { + con_st.out = stdout; + fclose(con_st.tty); + con_st.tty = nullptr; + } + // Restore the terminal settings on POSIX systems + tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state); +#endif +} + +/* Keep track of current color of output, and emit ANSI code if it changes. */ +void console_set_color(console_state & con_st, console_color_t color) { + if (con_st.use_color && con_st.color != color) { + fflush(stdout); + switch(color) { + case CONSOLE_COLOR_DEFAULT: + fprintf(con_st.out, ANSI_COLOR_RESET); + break; + case CONSOLE_COLOR_PROMPT: + fprintf(con_st.out, ANSI_COLOR_YELLOW); + break; + case CONSOLE_COLOR_USER_INPUT: + fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN); + break; + case CONSOLE_COLOR_ERROR: + fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED); + break; + } + con_st.color = color; + fflush(con_st.out); + } +} + +char32_t getchar32() { +#if defined(_WIN32) + HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); + wchar_t high_surrogate = 0; + + while (true) { + INPUT_RECORD record; + DWORD count; + if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) { + return WEOF; + } + + if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) { + wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar; + if (wc == 0) { + continue; + } + + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + high_surrogate = wc; + continue; + } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate + if (high_surrogate != 0) { // Check if we have a high surrogate + return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000; + } + } + + high_surrogate = 0; // Reset the high surrogate + return static_cast(wc); + } + } +#else + wchar_t wc = getwchar(); + if (static_cast(wc) == WEOF) { + return WEOF; + } + +#if WCHAR_MAX == 0xFFFF + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + wchar_t low_surrogate = getwchar(); + if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate + return (static_cast(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000; + } + } + if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair + return 0xFFFD; // Return the replacement character U+FFFD + } +#endif + + return static_cast(wc); +#endif +} + +void pop_cursor(console_state & con_st) { +#if defined(_WIN32) + if (con_st.hConsole != NULL) { + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo); + + COORD newCursorPosition = bufferInfo.dwCursorPosition; + if (newCursorPosition.X == 0) { + newCursorPosition.X = bufferInfo.dwSize.X - 1; + newCursorPosition.Y -= 1; + } else { + newCursorPosition.X -= 1; + } + + SetConsoleCursorPosition(con_st.hConsole, newCursorPosition); + return; + } +#endif + putc('\b', con_st.out); +} + +int estimateWidth(char32_t codepoint) { +#if defined(_WIN32) + return 1; +#else + return wcwidth(codepoint); +#endif +} + +int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) { + // go with the default + return expectedWidth; + } + COORD initialPosition = bufferInfo.dwCursorPosition; + DWORD nNumberOfChars = length; + WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); + + CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; + GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo); + + // Figure out our real position if we're in the last column + if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) { + DWORD nNumberOfChars; + WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL); + GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo); + } + + int width = newBufferInfo.dwCursorPosition.X - initialPosition.X; + if (width < 0) { + width += newBufferInfo.dwSize.X; + } + return width; +#else + // we can trust expectedWidth if we've got one + if (expectedWidth >= 0 || con_st.tty == nullptr) { + fwrite(utf8_codepoint, length, 1, con_st.out); + return expectedWidth; + } + + fputs("\033[6n", con_st.tty); // Query cursor position + int x1, x2, y1, y2; + int results = 0; + results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1); + + fwrite(utf8_codepoint, length, 1, con_st.tty); + + fputs("\033[6n", con_st.tty); // Query cursor position + results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2); + + if (results != 4) { + return expectedWidth; + } + + int width = x2 - x1; + if (width < 0) { + // Calculate the width considering text wrapping + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + width += w.ws_col; + } + return width; +#endif +} + +void replace_last(console_state & con_st, char ch) { +#if defined(_WIN32) + pop_cursor(con_st); + put_codepoint(con_st, &ch, 1, 1); +#else + fprintf(con_st.out, "\b%c", ch); +#endif +} + +void append_utf8(char32_t ch, std::string & out) { + if (ch <= 0x7F) { + out.push_back(static_cast(ch)); + } else if (ch <= 0x7FF) { + out.push_back(static_cast(0xC0 | ((ch >> 6) & 0x1F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0xFFFF) { + out.push_back(static_cast(0xE0 | ((ch >> 12) & 0x0F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0x10FFFF) { + out.push_back(static_cast(0xF0 | ((ch >> 18) & 0x07))); + out.push_back(static_cast(0x80 | ((ch >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else { + // Invalid Unicode code point + } +} + +// Helper function to remove the last UTF-8 character from a string +void pop_back_utf8_char(std::string & line) { + if (line.empty()) { + return; + } + + size_t pos = line.length() - 1; + + // Find the start of the last UTF-8 character (checking up to 4 bytes back) + for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) { + if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character + } + line.erase(pos); +} + +bool console_readline(console_state & con_st, std::string & line) { + console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); + if (con_st.out != stdout) { + fflush(stdout); + } + + line.clear(); + std::vector widths; + bool is_special_char = false; + bool end_of_stream = false; + + char32_t input_char; + while (true) { + fflush(con_st.out); // Ensure all output is displayed before waiting for input + input_char = getchar32(); + + if (input_char == '\r' || input_char == '\n') { + break; + } + + if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) { + end_of_stream = true; + break; + } + + if (is_special_char) { + console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); + replace_last(con_st, line.back()); + is_special_char = false; + } + + if (input_char == '\033') { // Escape sequence + char32_t code = getchar32(); + if (code == '[' || code == 0x1B) { + // Discard the rest of the escape sequence + while ((code = getchar32()) != (char32_t) WEOF) { + if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') { + break; + } + } + } + } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace + if (!widths.empty()) { + int count; + do { + count = widths.back(); + widths.pop_back(); + // Move cursor back, print space, and move cursor back again + for (int i = 0; i < count; i++) { + replace_last(con_st, ' '); + pop_cursor(con_st); + } + pop_back_utf8_char(line); + } while (count == 0 && !widths.empty()); + } + } else { + int offset = line.length(); + append_utf8(input_char, line); + int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); + if (width < 0) { + width = 0; + } + widths.push_back(width); + } + + if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { + console_set_color(con_st, CONSOLE_COLOR_PROMPT); + replace_last(con_st, line.back()); + is_special_char = true; + } + } + + bool has_more = con_st.multiline_input; + if (is_special_char) { + replace_last(con_st, ' '); + pop_cursor(con_st); + + char last = line.back(); + line.pop_back(); + if (last == '\\') { + line += '\n'; + fputc('\n', con_st.out); + has_more = !has_more; + } else { + // llama will just eat the single space, it won't act as a space + if (line.length() == 1 && line.back() == ' ') { + line.clear(); + pop_cursor(con_st); + } + has_more = false; + } + } else { + if (end_of_stream) { + has_more = false; + } else { + line += '\n'; + fputc('\n', con_st.out); + } + } + + fflush(con_st.out); + return has_more; +} diff --git a/examples/falcon_common.h b/examples/falcon_common.h new file mode 100644 index 000000000..c55c7897b --- /dev/null +++ b/examples/falcon_common.h @@ -0,0 +1,136 @@ +// Various helper functions and utilities + +#pragma once + +#include "libfalcon.h" + +#include +#include +#include +#include +#include + +#if !defined (_WIN32) +#include +#include +#endif + +// +// CLI argument parsing +// +int32_t get_num_physical_cores(); + +struct gpt_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_gpu_layers = 0; // number of layers to store in VRAM + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + + // sampling parameters + std::unordered_map logit_bias; // logit bias for specific tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + float repeat_penalty = 1.10f; // 1.0 = disabled + int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float frequency_penalty = 0.00f; // 0.0 = disabled + float presence_penalty = 0.00f; // 0.0 = disabled + int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + + std::string model = "models/7B/ggml-model.bin"; // model path + std::string model_alias = "unknown"; // model alias + std::string prompt = ""; + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::vector antiprompt; // string upon seeing which more user input is prompted + + std::string lora_adapter = ""; // lora adapter path + std::string lora_base = ""; // base model path for the lora adapter + + bool memory_f16 = true; // use f16 instead of f32 for memory kv + bool random_prompt = false; // do not randomize prompt if none provided + bool use_color = false; // use color to distinguish generations and inputs + bool interactive = false; // interactive mode + bool prompt_cache_all = false; // save user input and generations to prompt cache + bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it + + bool embedding = false; // get only sentence embedding + bool interactive_first = false; // wait for user input immediately + bool multiline_input = false; // reverse the usage of `\` + + bool instruct = false; // instruction mode (used for Alpaca models) + bool penalize_nl = true; // consider newlines as a repeatable token + bool perplexity = false; // compute perplexity over the prompt + bool use_mmap = true; // use mmap for faster loads + bool use_mlock = false; // use mlock to keep model in memory + bool mem_test = false; // compute maximum memory usage + bool export_cgraph = false; // export the computation graph + bool verbose_prompt = false; // print prompt tokens before generation +}; + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params); + +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); + +std::string gpt_random_prompt(std::mt19937 & rng); + +// +// Vocab utils +// + +std::vector falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos); + +// +// Model utils +// + +struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params); + +// +// Console utils +// + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +enum console_color_t { + CONSOLE_COLOR_DEFAULT=0, + CONSOLE_COLOR_PROMPT, + CONSOLE_COLOR_USER_INPUT, + CONSOLE_COLOR_ERROR +}; + +struct console_state { + bool multiline_input = false; + bool use_color = false; + console_color_t color = CONSOLE_COLOR_DEFAULT; + + FILE* out = stdout; +#if defined (_WIN32) + void* hConsole; +#else + FILE* tty = nullptr; + termios prev_state; +#endif +}; + +void console_init(console_state & con_st); +void console_cleanup(console_state & con_st); +void console_set_color(console_state & con_st, console_color_t color); +bool console_readline(console_state & con_st, std::string & line); diff --git a/examples/falcon_quantize/CMakeLists.txt b/examples/falcon_quantize/CMakeLists.txt new file mode 100644 index 000000000..32f3104ef --- /dev/null +++ b/examples/falcon_quantize/CMakeLists.txt @@ -0,0 +1,7 @@ +set(TARGET falcon_quantize) +add_executable(${TARGET} quantize.cpp) +target_link_libraries(${TARGET} PRIVATE libfalcon ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/falcon_quantize/README.md b/examples/falcon_quantize/README.md new file mode 100644 index 000000000..f349e913e --- /dev/null +++ b/examples/falcon_quantize/README.md @@ -0,0 +1,3 @@ +# quantize + +TODO diff --git a/examples/falcon_quantize/quantize.cpp b/examples/falcon_quantize/quantize.cpp new file mode 100644 index 000000000..3af7f3ffe --- /dev/null +++ b/examples/falcon_quantize/quantize.cpp @@ -0,0 +1,261 @@ +#include "build-info.h" + +#include "libfalcon.h" + +#include +#include +#include +#include + +struct quant_option { + std::string name; + llama_ftype ftype; + std::string desc; +}; + +static const std::vector QUANT_OPTIONS = { + { + "Q4_0", + LLAMA_FTYPE_MOSTLY_Q4_0, + " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", + }, + { + "Q4_1", + LLAMA_FTYPE_MOSTLY_Q4_1, + " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L", + }, + { + "Q5_0", + LLAMA_FTYPE_MOSTLY_Q5_0, + " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M", + }, + { + "Q5_1", + LLAMA_FTYPE_MOSTLY_Q5_1, + " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M", + }, +#ifdef GGML_USE_K_QUANTS + { + "Q2_K", + LLAMA_FTYPE_MOSTLY_Q2_K, + " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", + }, + { + "Q3_K", + LLAMA_FTYPE_MOSTLY_Q3_K_M, + "alias for Q3_K_M" + }, + { + "Q3_K_S", + LLAMA_FTYPE_MOSTLY_Q3_K_S, + " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss", + }, + { + "Q3_K_M", + LLAMA_FTYPE_MOSTLY_Q3_K_M, + " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss", + }, + { + "Q3_K_L", + LLAMA_FTYPE_MOSTLY_Q3_K_L, + " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss", + }, + { + "Q4_K", + LLAMA_FTYPE_MOSTLY_Q4_K_M, + "alias for Q4_K_M", + }, + { + "Q4_K_S", + LLAMA_FTYPE_MOSTLY_Q4_K_S, + " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss", + }, + { + "Q4_K_M", + LLAMA_FTYPE_MOSTLY_Q4_K_M, + " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*", + }, + { + "Q5_K", + LLAMA_FTYPE_MOSTLY_Q5_K_M, + "alias for Q5_K_M", + }, + { + "Q5_K_S", + LLAMA_FTYPE_MOSTLY_Q5_K_S, + " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*", + }, + { + "Q5_K_M", + LLAMA_FTYPE_MOSTLY_Q5_K_M, + " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*", + }, + { + "Q6_K", + LLAMA_FTYPE_MOSTLY_Q6_K, + " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss", + }, +#endif + { + "Q8_0", + LLAMA_FTYPE_MOSTLY_Q8_0, + " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended", + }, + { + "F16", + LLAMA_FTYPE_MOSTLY_F16, + "13.00G @ 7B - extremely large, virtually no quality loss - not recommended", + }, + { + "F32", + LLAMA_FTYPE_ALL_F32, + "26.00G @ 7B - absolutely huge, lossless - not recommended", + }, +}; + + +bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { + std::string ftype_str; + + for (auto ch : ftype_str_in) { + ftype_str.push_back(std::toupper(ch)); + } + for (auto & it : QUANT_OPTIONS) { + if (it.name == ftype_str) { + ftype = it.ftype; + ftype_str_out = it.name; + return true; + } + } + try { + int ftype_int = std::stoi(ftype_str); + for (auto & it : QUANT_OPTIONS) { + if (it.ftype == ftype_int) { + ftype = it.ftype; + ftype_str_out = it.name; + return true; + } + } + } + catch (...) { + // stoi failed + } + return false; +} + +// usage: +// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] +// +void usage(const char * executable) { + fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable); + fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + fprintf(stderr, "\nAllowed quantization types:\n"); + for (auto & it : QUANT_OPTIONS) { + printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + } + exit(1); +} + +int main(int argc, char ** argv) { + if (argc < 3) { + usage(argv[0]); + } + + llama_model_quantize_params params = llama_model_quantize_default_params(); + + int arg_idx = 1; + + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { + params.quantize_output_tensor = false; + } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { + params.allow_requantize = true; + } else { + usage(argv[0]); + } + } + + if (argc - arg_idx < 3) { + usage(argv[0]); + } + + llama_init_backend(); + + // parse command line arguments + const std::string fname_inp = argv[arg_idx]; + arg_idx++; + std::string fname_out; + + std::string ftype_str; + if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { + std::string fpath; + const size_t pos = fname_inp.find_last_of('/'); + if (pos != std::string::npos) { + fpath = fname_inp.substr(0, pos + 1); + } + // export as [inp path]/ggml-model-[ftype].bin + fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; + arg_idx++; + } + else { + fname_out = argv[arg_idx]; + arg_idx++; + + if (argc <= arg_idx) { + fprintf(stderr, "%s: missing ftype\n", __func__); + return 1; + } + if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { + fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); + return 1; + } + arg_idx++; + } + + // parse nthreads + if (argc > arg_idx) { + try { + params.nthread = std::stoi(argv[arg_idx]); + } + catch (const std::exception & e) { + fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what()); + return 1; + } + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); + if (params.nthread > 0) { + fprintf(stderr, " using %d threads", params.nthread); + } + fprintf(stderr, "\n"); + + const int64_t t_main_start_us = llama_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = llama_time_us(); + + if (falcon_model_quantize(fname_inp.c_str(), fname_out.c_str(), ¶ms)) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = llama_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = llama_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); + } + + return 0; +} diff --git a/falcon_convert_demo.py b/falcon_convert_demo.py new file mode 100644 index 000000000..07f2a186b --- /dev/null +++ b/falcon_convert_demo.py @@ -0,0 +1,144 @@ +# Based on: https://github.com/KerfuffleV2/ggml-falcon/blob/feat-improve-falcon-convert-hf/examples/falcon/convert-hf-to-ggml.py +# Convert Hugging Face fine-tuned bloom-like models to ggml format +# +# Usage: +# +# python3 models/convert-h5-to-ggml.py +# +# This script is similar to "convert-pt-to-ggml.py" +# + +import io +import os +import sys +import struct +import json +import code +import torch +import numpy as np + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 4: + print("Usage: python convert-hf-to-ggml.py num_parts model_name dir-output [use-f32]") + print(" num_parts: number of pytorch parts, use 0 if not a multipart model. example: 9") + print(" model_name: name of the model to convert. Example: 'bigscience/bloomz-560m'") + print(" dir-output: directory where the output file will be written") + print(" use-f32: if present, use float32 instead of float16") + sys.exit(1) + +num_parts = int(sys.argv[1]) +model_name = sys.argv[2] +dir_out = sys.argv[3] + +# make sure the output directory exists +os.makedirs(dir_out, exist_ok=True) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] +ftype = 1 +if len(sys.argv) > 4: + ftype = 0 + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) +hparams = config.to_dict() + +n_head = hparams["n_head"] +n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 +head_dim = hparams["hidden_size"] // n_head +print("* Loading model from: ", model_name) + +fname_out = dir_out + f"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin" +fout = open(fname_out, "wb") +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["hidden_size"])) +fout.write(struct.pack("i", n_head)) +fout.write(struct.pack("i", n_head_kv)) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", 40 if "n_head_kv" in hparams else 7)) +fout.write(struct.pack("i", ftype)) + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +for i in range(hparams["vocab_size"]): + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +if num_parts == 0: + partnames= ('pytorch_model.bin',) +else: + partnames = (f'pytorch_model-{n:05}-of-{num_parts:05}.bin' for n in range(1, num_parts + 1)) +for partname in partnames: + filename = f'{model_name}/{partname}' + print(f'\n* Loading part: {partname}') + model = torch.load(filename, map_location = 'cpu') + for name in model.keys(): + src = name + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + + if "query_key_value" in src: + qkv = model[src].view( + n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + + q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + + model[src] = torch.cat((q,k,v)).reshape_as(model[src]) + data = model[src].squeeze() + n_dims = len(data.shape) + # default type is fp32 + ftype_cur = 1 if ftype == 1 and n_dims > 1 else 0 + data = data.to(dtype = torch.float16 if ftype_cur == 1 else torch.float32).numpy() + print(f' |', name, data.shape, '->', data.dtype) + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") \ No newline at end of file diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3b9a5ddfb..b9c9ad258 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1663,6 +1663,7 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (src0->backend == GGML_BACKEND_GPU) printf("src0->backend != GGML_BACKEND_GPU (%s)\n",src0->name); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU); const int64_t ne10 = src1->ne[0]; diff --git a/ggml.c b/ggml.c index 32c191307..5e832b474 100644 --- a/ggml.c +++ b/ggml.c @@ -3650,9 +3650,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS", "CROSS_ENTROPY_LOSS_BACK", + "REPEAT2", + "REPEAT2_BACK", }; -static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); +static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 57"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3718,9 +3720,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", + "repeat2(x)", + "repeat2_back(x)", }; -static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); +static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -5209,6 +5213,62 @@ struct ggml_tensor * ggml_repeat_back( return result; } +// ggml_repeat2 + +struct ggml_tensor * ggml_repeat2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(a, b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_repeat2_back + +struct ggml_tensor * ggml_repeat2_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT2_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_abs struct ggml_tensor * ggml_abs_impl( @@ -9180,6 +9240,182 @@ static void ggml_compute_forward_repeat_back( } } +// ggml_compute_forward_repeat2 + +static void ggml_compute_forward_repeat2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + int i2k2 = 0; + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++, i2k2 = 0) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++, i2k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + (i2k2 / nr2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat2( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat2_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat2_back_f32 + +static void ggml_compute_forward_repeat2_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + int k2i2 = 0; + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++, k2i2 = 0) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++, k2i2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (k2*i2 % nr2)*nb02 + ( k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat2_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat2_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_abs static void ggml_compute_forward_abs_f32( @@ -14303,6 +14539,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } + if (tensor->src0->backend != GGML_BACKEND_CPU) + printf("%s src0->backend != GGML_BACKEND_CPU (%s; %s)\n", ggml_op_name (tensor->op),tensor->name,tensor->src0->src0->name); GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS @@ -14368,6 +14606,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_repeat_back(params, tensor->src0, tensor); } break; + case GGML_OP_REPEAT2: + { + ggml_compute_forward_repeat2(params, tensor->src0, tensor); + } break; + case GGML_OP_REPEAT2_BACK: + { + ggml_compute_forward_repeat2_back(params, tensor->src0, tensor); + } break; case GGML_OP_ABS: { ggml_compute_forward_abs(params, tensor->src0, tensor); @@ -14738,6 +14984,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } } break; + case GGML_OP_REPEAT2: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat2_back(ctx, tensor->grad, src0->grad), + inplace); + } + } break; case GGML_OP_REPEAT_BACK: { if (src0->grad) { @@ -14748,6 +15004,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } } break; + case GGML_OP_REPEAT2_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat2(ctx, tensor->grad, src0->grad), + inplace); + } + } break; case GGML_OP_ABS: { if (src0->grad) { @@ -15726,6 +15992,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MEAN: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: + case GGML_OP_REPEAT2: + case GGML_OP_REPEAT2_BACK: case GGML_OP_ABS: case GGML_OP_SGN: case GGML_OP_NEG: diff --git a/ggml.h b/ggml.h index f2a91761b..929f43913 100644 --- a/ggml.h +++ b/ggml.h @@ -198,7 +198,7 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 -#define GGML_MAX_NAME 32 +#define GGML_MAX_NAME 64 #define GGML_DEFAULT_N_THREADS 4 #define GGML_ASSERT(x) \ @@ -344,6 +344,9 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS_BACK, + // https://github.com/jploski/ggml/commit/3352043d851fbc84a46e251c3281d24bd18efeb2 + GGML_OP_REPEAT2, + GGML_OP_REPEAT2_BACK, // untested, probably not working GGML_OP_COUNT, }; @@ -662,6 +665,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_repeat2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_repeat2_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); diff --git a/libfalcon.cpp b/libfalcon.cpp new file mode 100644 index 000000000..227c09076 --- /dev/null +++ b/libfalcon.cpp @@ -0,0 +1,3454 @@ +// Defines fileno on msys: +// inference&model based on the ggml falcon example PR from https://github.com/KerfuffleV2/ggml-falcon +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#endif + +#include "llama-util.h" +#include "libfalcon.h" + +#include "ggml.h" +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LLAMA_USE_SCRATCH +#define LLAMA_MAX_SCRATCH_BUFFERS 16 + +// available falcon models +enum e_model { + FALCON_UNKNOWN, + FALCON_7B, + FALCON_40B, +}; + +static const size_t MB = 1024*1024; + +// computed for n_ctx == 2048 +// TODO: dynamically determine these sizes +// needs modifications in ggml + +typedef void (*offload_func_t)(struct ggml_tensor * tensor); + +void llama_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} + +static const std::map & MEM_REQ_SCRATCH0() +{ + static std::map k_sizes = { + { FALCON_7B, 512ull * MB }, + { FALCON_40B, 1024ull * MB }, + }; + return k_sizes; +} + +static const std::map & MEM_REQ_SCRATCH1() +{ + static std::map k_sizes = { + { FALCON_7B, 512ull * MB }, + { FALCON_40B, 1024ull * MB }, + }; + return k_sizes; +} + +// 2*n_embd*n_ctx*n_layer*sizeof(float16) +static const std::map & MEM_REQ_KV_SELF() +{ + static std::map k_sizes = { + { FALCON_7B, 1026ull * MB }, + { FALCON_40B, 5120ull * MB }, + }; + return k_sizes; +} + +// this is mostly needed for temporary mul_mat buffers to dequantize the data +// not actually needed if BLAS is disabled +static const std::map & MEM_REQ_EVAL() +{ + static std::map k_sizes = { + { FALCON_7B, 768ull * MB }, + { FALCON_40B, 1536ull * MB }, + }; + return k_sizes; +} + +// default hparams (Falcon 7B) +struct falcon_hparams { + int32_t n_vocab = 65024; + int32_t n_ctx = 2048; + int32_t n_embd = 4544; + int32_t n_head = 71; + int32_t n_head_kv = 1; + int32_t n_layer = 32; + int32_t version = 7; // 7 for Falcon-7B, 40 for Falcon-40B + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; + + bool operator!=(const falcon_hparams & other) const { + return static_cast(memcmp(this, &other, sizeof(falcon_hparams))); + } +}; + +struct falcon_layer { + // normalization + struct ggml_tensor* input_layernorm; + struct ggml_tensor* input_layernorm_b; + struct ggml_tensor* attention_norm; // Falcon-40B only + struct ggml_tensor* attention_norm_b; // Falcon-40B only + + // attention + struct ggml_tensor* query_key_value; + struct ggml_tensor* wo; + + // ff + struct ggml_tensor* ffn_up; + struct ggml_tensor* ffn_down; +}; + +struct falcon_kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx = NULL; + + llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache + + ~falcon_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } +}; + +struct falcon_model { + e_model type = FALCON_UNKNOWN; + + falcon_hparams hparams; + + struct ggml_tensor* tok_embeddings; + struct ggml_tensor* output_norm; + struct ggml_tensor* output_norm_b; + struct ggml_tensor* lm_head; + // struct ggml_tensor* output; + + std::vector layers; + + // key + value memory + struct ggml_tensor* memory_k; + struct ggml_tensor* memory_v; + int n_gpu_layers; + + // context + struct ggml_context * ctx = NULL; + std::map tensors; + + // key + value cache for the self attention + // TODO: move to llama_state + struct falcon_kv_cache kv_self; + + // the model memory buffer + llama_ctx_buffer buf; + + // model memory mapped file + std::unique_ptr mapping; + + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + + ~falcon_model() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cuda_free_data(tensors_by_name[i].second); + } +#elif defined(GGML_USE_CLBLAST) + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cl_free_data(tensors_by_name[i].second); + } +#endif + } +}; + +struct falcon_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + +struct falcon_context { + std::mt19937 rng; + + int64_t t_load_us = 0; + int64_t t_start_us = 0; + bool has_evaluated_once = false; + + int64_t t_sample_us = 0; + int64_t t_eval_us = 0; + int64_t t_p_eval_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_eval = 0; // number of eval calls + int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + + falcon_model model; + falcon_vocab vocab; + + size_t mem_per_token = 0; + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector logits; + bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // memory buffers used to evaluate the model + // TODO: move in llama_state + llama_ctx_buffer buf_compute; + llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = NULL; +#endif + + int buf_last = 0; + size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; + + void use_buf(struct ggml_context * ctx, int i) { +#if defined(LLAMA_USE_SCRATCH) + size_t last_size = 0; + + if (i == -1) { + last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + } else { + auto & buf = buf_scratch[i]; + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); + } + + if (buf_last >= 0) { + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); + } + + buf_last = i; +#else + (void) i; + (void) ctx; +#endif + } + + size_t get_buf_max_mem(int i) const { +#if defined(LLAMA_USE_SCRATCH) + return buf_max_size[i]; +#else + (void) i; + return 0; +#endif + } +}; + +template +static T checked_mul(T a, T b) { + T ret = a * b; + if (a != 0 && ret / a != b) { + throw std::runtime_error(format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b)); + } + return ret; +} + +static size_t checked_div(size_t a, size_t b) { + if (b == 0 || a % b != 0) { + throw std::runtime_error(format("error dividing %zu / %zu", a, b)); + } + return a / b; +} + +static std::string llama_format_tensor_shape(const std::vector & ne) { + char buf[256]; + snprintf(buf, sizeof(buf), "%5u", ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i)); + } + return buf; +} + +static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { + size_t size = ggml_type_size(type); + for (uint32_t dim : ne) { + size = checked_mul(size, dim); + } + return size / ggml_blck_size(type); +} + +struct llama_load_tensor_shard { + std::vector ne; + size_t size; + enum ggml_type type; + size_t file_idx; + size_t file_off; + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +enum llama_split_type { + SPLIT_NONE, + SPLIT_BY_COLUMNS, + SPLIT_BY_ROWS +}; + +struct falcon_load_tensor { + std::vector shards; + + std::string name; + enum ggml_type type = GGML_TYPE_F32; + llama_split_type split_type = SPLIT_NONE; + std::vector ne; + size_t size; + struct ggml_tensor * ggml_tensor = NULL; + uint8_t * data; + + falcon_load_tensor(const std::string & name) : name(name) {} + + void calc_all() { + calc_type(); + calc_split_type(); + calc_ne(); + calc_size(); + } + + void calc_type() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.type != first_shard.type) { + throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str())); + } + } + type = first_shard.type; + } + + void calc_split_type() { + if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file + shards.size() == 1) { // only one file? + split_type = SPLIT_NONE; + } else if (name.find("tok_embeddings.") == 0 || + name.find(".attention.wo.weight") != std::string::npos || + name.find(".feed_forward.w2.weight") != std::string::npos) { + split_type = SPLIT_BY_COLUMNS; + } else { + split_type = SPLIT_BY_ROWS; + } + } + + void calc_ne() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.ne != first_shard.ne) { + throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str())); + } + } + ne = first_shard.ne; + LLAMA_ASSERT(shards.size() <= UINT32_MAX); + uint32_t n_shards = (uint32_t) shards.size(); + switch (split_type) { + case SPLIT_NONE: + ne = first_shard.ne; + break; + case SPLIT_BY_COLUMNS: + ne = {checked_mul(first_shard.ne[0], n_shards), + first_shard.ne[1]}; + break; + case SPLIT_BY_ROWS: + ne = {first_shard.ne[0], + checked_mul(first_shard.ne[1], n_shards)}; + break; + } + } + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +struct llama_load_tensors_map { + // tensors is kept in a separate vector to preserve file order + std::vector tensors; + std::unordered_map name_to_idx; +}; + +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, // ftype incompatible when using the current falcon converter, remainder is ggml format + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding + LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format + LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format +}; + +struct falcon_file_loader { + llama_file file; + llama_file_version file_version; + falcon_hparams hparams; + falcon_vocab vocab; + + falcon_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + : file(fname, "rb") { + fprintf(stderr, "falcon.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(file_idx, tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + + if (magic == LLAMA_FILE_MAGIC_GGML) { + file_version = LLAMA_FILE_VERSION_GGML; + return; + } + + uint32_t version = file.read_u32(); + + switch (magic) { + case LLAMA_FILE_MAGIC_GGMF: + switch (version) { + case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return; + } + break; + case LLAMA_FILE_MAGIC_GGJT: + switch (version) { + case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return; + case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return; + case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return; + } + } + + throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version)); + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_head_kv = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.version = file.read_u32(); + // outdated ftype handling for ggml version 1 (TODO: upgrade and support both variants for compatibility) + if (file_version == LLAMA_FILE_VERSION_GGML) + { + int32_t ftype = file.read_u32(); + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + hparams.ftype = (enum llama_ftype) (hparams.ftype % GGML_QNT_VERSION_FACTOR); + } else + { + hparams.ftype = (enum llama_ftype) file.read_u32(); + } + + // hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + file.read_raw(&score, sizeof(score)); + } + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_load_tensor_shard shard; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + shard.type = (enum ggml_type) file.read_u32(); + shard.ne.resize(n_dims); + file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + std::string name = file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw std::runtime_error(format("falcon.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); + } + switch (shard.type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: { + throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type)); + } + } + + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + // skip to the next multiple of 32 bytes + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + } + shard.file_idx = file_idx; + shard.file_off = file.tell(); + + shard.calc_size(); + file.seek(shard.size, SEEK_CUR); + + auto it = tensors_map.name_to_idx.find(name); + size_t idx; + if (it != tensors_map.name_to_idx.end()) { + idx = it->second; + } else { + tensors_map.tensors.emplace_back(name); + idx = tensors_map.tensors.size() - 1; + tensors_map.name_to_idx.emplace(name, idx); + } + tensors_map.tensors.at(idx).shards.push_back(shard); + } + } +}; + +struct llama_file_saver { + llama_file file; + falcon_file_loader * any_file_loader; + llama_file_saver(const char * fname, falcon_file_loader * any_file_loader, enum llama_ftype new_ftype) + : file(fname, "wb"), any_file_loader(any_file_loader) { + fprintf(stderr, "falcon.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_ftype); + write_vocab(); + } + void write_magic() { + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + } + void write_hparams(enum llama_ftype new_ftype) { + const falcon_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_head_kv); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.version); + file.write_u32(new_ftype); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + fprintf(stderr, "falcon.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(falcon_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + switch (new_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: LLAMA_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(new_type); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_model_loader { + std::vector> file_loaders; + llama_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr mapping; + + llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto * first_file = new falcon_file_loader(fname_base.c_str(), 0, tensors_map); + file_loaders.emplace_back(first_file); + uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); + for (uint32_t i = 1; i < n_parts; i++) { + std::string fname = fname_base + "." + std::to_string(i); + auto * ith_file = new falcon_file_loader(fname.c_str(), i, tensors_map); + file_loaders.emplace_back(ith_file); + if (ith_file->hparams != first_file->hparams) { + throw std::runtime_error(format("falcon.cpp: hparams inconsistent between files")); + } + } + if (!llama_mmap::SUPPORTED) { + use_mmap = false; + } + if (use_mmap && alignment_prevents_mmap()) { + fprintf(stderr, "falcon.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); + use_mmap = false; + } + this->use_mmap = use_mmap; + for (falcon_load_tensor & lt : tensors_map.tensors) { + lt.calc_all(); + } + } + + bool alignment_prevents_mmap() { + for (const falcon_load_tensor & lt : tensors_map.tensors) { + for (const llama_load_tensor_shard & shard : lt.shards) { + if (shard.file_off & 3) { + return true; + } + } + } + return false; + } + + uint32_t guess_n_parts() const { + auto it = tensors_map.name_to_idx.find("transformer.word_embeddings.weight"); + if (it == tensors_map.name_to_idx.end()) { + throw std::runtime_error(std::string("missing tok_embeddings.weight")); + } + const falcon_load_tensor & lt = tensors_map.tensors.at(it->second); + return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); + } + + void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { + *ctx_size_p = *mmapped_size_p = 0; + for (const falcon_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *ctx_size_p += 64 * MB; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + } + } + + struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { + auto it = tensors_map.name_to_idx.find(name); + if (it == tensors_map.name_to_idx.end()) { + throw std::runtime_error(std::runtime_error(format("falcon.cpp: tensor '%s' is missing from model", name.c_str()))); + } + falcon_load_tensor & lt = tensors_map.tensors.at(it->second); + if (lt.ne != ne) { + throw std::runtime_error(format("falcon.cpp: tensor '%s' has wrong shape; expected %s, got %s", + name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str())); + } + + return get_tensor_for(lt, backend); + } + + struct ggml_tensor * get_tensor_for(falcon_load_tensor & lt, ggml_backend backend) { + struct ggml_tensor * tensor; + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ggml_ctx, true); + } + if (lt.ne.size() == 2) { + tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + } else { + LLAMA_ASSERT(lt.ne.size() == 1); + tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + } + ggml_set_name(tensor, lt.name.c_str()); + // printf("falcon.cpp: creating tensor %s\n", lt.name.c_str()); + LLAMA_ASSERT(lt.ggml_tensor == NULL ); // if this fails, we called get_tensor twice on the same tensor + + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ggml_ctx, use_mmap); + } + tensor->backend = backend; + lt.ggml_tensor = tensor; + num_ggml_tensors_created++; + return tensor; + } + + void done_getting_tensors() const { + if (num_ggml_tensors_created != tensors_map.tensors.size()) { + throw std::runtime_error(std::string("falcon.cpp: file contained more tensors than expected")); + } + } + + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t data_size = 0; + size_t prefetch_size = 0; + size_t lock_size = 0; + for (const falcon_load_tensor & lt : tensors_map.tensors) { + data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + prefetch_size += lt.size; + } + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size)); + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (falcon_load_tensor & lt : tensors_map.tensors) { + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_tensor->data; + + // allocate temp buffer if not using mmap + if (!use_mmap && lt.data == NULL) { + GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU); + lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor)); + } + + load_data_for(lt); + + switch(lt.ggml_tensor->backend) { + case GGML_BACKEND_CPU: + lt.ggml_tensor->data = lt.data; + if (use_mmap && lmlock) { + lock_size += lt.size; + lmlock->grow_to(lock_size); + } + break; +#if defined(GGML_USE_CUBLAS) + case GGML_BACKEND_GPU: + case GGML_BACKEND_GPU_SPLIT: + ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + if (!use_mmap) { + free(lt.data); + } + break; +#elif defined(GGML_USE_CLBLAST) + case GGML_BACKEND_GPU: + ggml_cl_transform_tensor(lt.data, lt.ggml_tensor); + if (!use_mmap) { + free(lt.data); + } + break; +#endif + default: + continue; + } + + done_size += lt.size; + } + } + + void load_data_for(falcon_load_tensor & lt) { + if (use_mmap) { + LLAMA_ASSERT(lt.shards.size() == 1); + lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; + } else if (lt.split_type == SPLIT_NONE) { + llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + file.seek(lt.shards.at(0).file_off, SEEK_SET); + file.read_raw(lt.data, lt.size); + } else if (lt.split_type == SPLIT_BY_ROWS) { + size_t offset = 0; + for (llama_load_tensor_shard & shard : lt.shards) { + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + file.read_raw(lt.data + offset, shard.size); + offset += shard.size; + } + LLAMA_ASSERT(offset == lt.size); + } else if (lt.split_type == SPLIT_BY_COLUMNS) { + // Let's load the data into temporary buffers to ensure the OS performs large loads. + std::vector tmp_bufs(lt.shards.size()); + for (size_t i = 0; i < lt.shards.size(); i++) { + llama_load_tensor_shard & shard = lt.shards.at(i); + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + tmp_bufs.at(i).resize(shard.size); + file.read_raw(tmp_bufs.at(i).addr, shard.size); + } + // Then reshape. + size_t num_rows = lt.ne.at(1); + size_t per_shard_row_size = lt.shards.at(0).size / num_rows; + size_t out_offset = 0; + for (size_t row = 0; row < num_rows; row++) { + for (llama_buffer & tmp_buf : tmp_bufs) { + memcpy(lt.data + out_offset, + tmp_buf.addr + row * per_shard_row_size, + per_shard_row_size); + out_offset += per_shard_row_size; + } + } + LLAMA_ASSERT(out_offset == lt.size); + } + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(falcon_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + + +// +// kv cache +// + +static bool kv_cache_init( + const struct falcon_hparams & hparams, + struct falcon_kv_cache & cache, + ggml_type wtype, + int n_ctx) { + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + struct ggml_init_params params; + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; + params.no_alloc = false; + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + ggml_set_name(cache.k, "cache_k"); + ggml_set_name(cache.v, "cache_v"); + + return true; +} + +struct falcon_context_params falcon_context_default_params() { + struct falcon_context_params result = { + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.gpu_layers =*/ 0, + /*.main_gpu =*/ 0, + /*.tensor_split =*/ {0}, + /*.seed =*/ -1, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.vocab_only =*/ false, + /*.use_mmap =*/ true, + /*.use_mlock =*/ false, + /*.embedding =*/ false, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, + }; + + return result; +} + +struct llama_model_quantize_params llama_model_quantize_default_params() { + struct llama_model_quantize_params result = { + /*.nthread =*/ 0, + /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.allow_requantize =*/ false, + /*.quantize_output_tensor =*/ true, + }; + + return result; +} + +bool llama_mmap_supported() { + return llama_mmap::SUPPORTED; +} + +bool llama_mlock_supported() { + return llama_mlock::SUPPORTED; +} + +void llama_init_backend() { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } +} + +int64_t llama_time_us() { + return ggml_time_us(); +} + +// +// model loading +// + +static const char *llama_file_version_name(llama_file_version version) { + switch (version) { + case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; + case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; + case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; + } + + return "unknown"; +} + +static const char *llama_ftype_name(enum llama_ftype ftype) { + switch (ftype) { + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + return "mostly Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + // K-quants + case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + default: return "unknown, may not work"; + } +} + +static const char *falcon_model_type_name(e_model type) { + switch (type) { + case FALCON_7B: return "7B"; + case FALCON_40B: return "40B"; + default: LLAMA_ASSERT(false); + } +} + +static void falcon_model_load_internal( + const std::string & fname, + falcon_context & lctx, + int n_ctx, + int n_batch, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + + lctx.t_start_us = ggml_time_us(); + + std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + + lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); + auto & model = lctx.model; + model.hparams = ml->file_loaders.at(0)->hparams; + model.n_gpu_layers = n_gpu_layers; + llama_file_version file_version = ml->file_loaders.at(0)->file_version; + auto & hparams = model.hparams; + + { + switch (hparams.n_layer) { + case 32: model.type = e_model::FALCON_7B; break; + case 40: model.type = e_model::FALCON_40B; break; + default: + { + if (hparams.version == 7) { + model.type = e_model::FALCON_7B; + } else + if (hparams.version == 40) { + model.type = e_model::FALCON_40B; + } else { + LLAMA_ASSERT(false); + } + } break; + } + + hparams.n_ctx = n_ctx; + } + + const uint32_t n_ff = 4 * model.hparams.n_embd; + + { + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: version = %u\n", __func__, hparams.version); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); + fprintf(stderr, "%s: model size = %s\n", __func__, falcon_model_type_name(model.type)); + } + + if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { + if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)")); + } + } + + if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { + if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || + hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || + hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)")); + } + } + + if (vocab_only) { + return; + } + + auto & ctx = model.ctx; + + size_t ctx_size; + size_t mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + fprintf(stderr, "%s: ggml ctx size = %7.2f MB (mmap size = %7.2f MB)\n", __func__, ctx_size/MB*1.0, mmapped_size/MB*1.0); + + + // create the ggml context + { + lctx.model.buf.resize(ctx_size); + if (use_mlock) { + lctx.model.mlock_buf.init(lctx.model.buf.addr); + lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + } + + struct ggml_init_params params = { + /*.mem_size =*/ lctx.model.buf.size, + /*.mem_buffer =*/ lctx.model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + throw std::runtime_error(format("ggml_init() failed")); + } + } + + (void) main_gpu; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#elif defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__); +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU +#else +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + // prepare memory for the weights + size_t vram_weights = 0; + size_t vram_scratch = 0; + { + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_head = hparams.n_head; + const uint32_t n_head_kv = hparams.n_head_kv; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_ff = 4 * model.hparams.n_embd; + const uint32_t n_vocab = hparams.n_vocab; + const uint32_t head_dim = hparams.n_embd / hparams.n_head; + + ml->ggml_ctx = ctx; + + model.tok_embeddings = ml->get_tensor("transformer.word_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + + + + ggml_backend backend_output; + if (n_gpu_layers > int(n_layer)) { // NOLINT + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_output = GGML_BACKEND_CPU; + } + + // "output" tensor + { + + model.output_norm = ml->get_tensor("transformer.ln_f.weight", {n_embd}, GGML_BACKEND_CPU); + model.output_norm_b = ml->get_tensor("transformer.ln_f.bias", {n_embd}, GGML_BACKEND_CPU); + model.lm_head = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + } + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + std::string layers_i = "layers." + std::to_string(i); + std::string str_i = std::to_string(i); + + if (model.type == FALCON_40B) + { + layer.input_layernorm = ml->get_tensor("transformer.h." + str_i +".ln_mlp.weight", {n_embd}, GGML_BACKEND_CPU); + layer.input_layernorm_b = ml->get_tensor("transformer.h." + str_i +".ln_mlp.bias", {n_embd}, GGML_BACKEND_CPU); + layer.attention_norm = ml->get_tensor("transformer.h." + str_i +".ln_attn.weight", {n_embd}, GGML_BACKEND_CPU); + layer.attention_norm_b = ml->get_tensor("transformer.h." + str_i +".ln_attn.bias", {n_embd}, GGML_BACKEND_CPU); + } else // FALCON_7B + { + layer.input_layernorm = ml->get_tensor("transformer.h." + str_i +".input_layernorm.weight", {n_embd}, backend); + layer.input_layernorm_b = ml->get_tensor("transformer.h." + str_i +".input_layernorm.bias", {n_embd}, backend); + } + + layer.query_key_value = ml->get_tensor("transformer.h." + str_i +".self_attention.query_key_value.weight", {n_embd, (n_head_kv * 2 + n_head) * head_dim}, GGML_BACKEND_CPU); + layer.wo = ml->get_tensor("transformer.h." + str_i +".self_attention.dense.weight", {n_embd, n_embd}, GGML_BACKEND_CPU); + + layer.ffn_up = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_h_to_4h.weight", {n_embd, n_ff}, GGML_BACKEND_CPU); // before gelu + layer.ffn_down = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_4h_to_h.weight", {n_ff, n_embd}, GGML_BACKEND_CPU); // after gelu + + if (backend == GGML_BACKEND_GPU) { + // llama: + // vram_weights += + // ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + // ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + // ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + // falcon: + if (model.type == FALCON_40B) + { + vram_weights += + ggml_nbytes(layer.input_layernorm) + ggml_nbytes(layer.input_layernorm_b) + + ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.attention_norm_b) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up); + } else // FALCON_7B + { + vram_weights += + ggml_nbytes(layer.input_layernorm) + ggml_nbytes(layer.input_layernorm_b) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up); + } + } + } + } + + // key + value memory + { + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head_kv = hparams.n_head_kv; + const int head_dim = hparams.n_embd / hparams.n_head; + + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements = head_dim*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: (a) memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + ml->done_getting_tensors(); + + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + mmapped_size - vram_weights + // weights in VRAM not in memory + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at (model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF().at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + + (void) vram_scratch; + (void) n_batch; +#ifdef GGML_USE_CUBLAS + vram_scratch = n_batch * MB; + ggml_cuda_set_scratch_size(vram_scratch); + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", + __func__, vram_scratch / MB); + } +#endif // GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: offloading output layer to GPU\n", __func__); + } + fprintf(stderr, "%s: total VRAM used: %zu MB\n", + __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up +#else + (void) n_gpu_layers; +#endif + } + + // populate `tensors_by_name` + for (falcon_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + } + + (void) tensor_split; +#if defined(GGML_USE_CUBLAS) + { + ggml_cuda_set_tensor_split(tensor_split); + } +#endif + + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + + model.mapping = std::move(ml->mapping); + + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + lctx.t_load_us = ggml_time_us() - lctx.t_start_us; +} + +static bool falcon_model_load( + const std::string & fname, + falcon_context & lctx, + int n_ctx, + int n_batch, + int n_gpu_layers, + int main_gpu, + float * tensor_split, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + falcon_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type, + use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::exception & err) { + fprintf(stderr, "error loading model: %s\n", err.what()); + return false; + } +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// - cgraph_fname: filename of the exported computation graph +// +static bool falcon_eval_internal( + falcon_context & lctx, + const llama_token * tokens, + const int n_tokens, + const int n_past, + const int n_threads, + const char * cgraph_fname) { + + // enforce that the first token is BOS + if (n_past == 0 && tokens[0] != falcon_token_bos()) { + fprintf(stderr, "%s: first token must be BOS\n", __func__); + return false; + } + + const int64_t t_start_us = ggml_time_us(); + + const int N = n_tokens; + //const int N = embd_inp.size(); + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = model.kv_self; + + LLAMA_ASSERT(!!kv_self.ctx); + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_head_kv = hparams.n_head_kv; + const int n_vocab = hparams.n_vocab; + const int version = hparams.version; + const int n_gpu_layers = model.n_gpu_layers; + const size_t head_dim = n_embd / n_head; // == n_rot in llama + + auto & mem_per_token = lctx.mem_per_token; + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + ggml_cgraph gf = {}; + gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head); + + struct ggml_tensor * layernorm_output; + + // ggml_type wtype = GGML_TYPE_F32; + // ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + // const int sizeof_wtype = ggml_type_sizef(wtype); + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + for (int il = 0; il < n_layer; ++il) { + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * inpSA = inpL; + + lctx.use_buf(ctx0, 0); + + + // self-attention + { + layernorm_output = ggml_norm(ctx0, inpL); + + layernorm_output = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].input_layernorm, layernorm_output), + layernorm_output), + ggml_repeat(ctx0, model.layers[il].input_layernorm_b, layernorm_output)); + ggml_set_name(layernorm_output, "layernorm_output"); + + if (model.type == FALCON_40B || version == 40) + { + cur = ggml_norm(ctx0, inpL); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].attention_norm, cur), + cur), + ggml_repeat(ctx0, model.layers[il].attention_norm_b, cur)); + } else { + cur = layernorm_output; + } + + // compute QKV + + cur = ggml_mul_mat(ctx0, model.layers[il].query_key_value, cur); + + // Note that the strides for Kcur, Vcur are set up so that the + // resulting views are misaligned with the tensor's storage + // (by applying the K/V offset we shift the tensor's original + // view to stick out behind the viewed QKV tensor's allocated + // memory, so to say). This is ok because no actual accesses + // happen to that out-of-range memory, but it can require some + // trickery when trying to accurately dump these views for + // debugging. + + struct ggml_tensor * Qcur = ggml_view_3d( + ctx0, cur, head_dim, n_head, N, + head_dim * ggml_element_size(cur), + head_dim * (n_head + 2 * n_head_kv) * ggml_element_size(cur), + 0); + ggml_set_name(Qcur, "Qcur"); + + struct ggml_tensor * Kcur = ggml_view_3d( + ctx0, cur, head_dim, n_head_kv, N, + head_dim * ggml_element_size(cur), + head_dim * (n_head + 2 * n_head_kv) * ggml_element_size(cur), + head_dim * n_head * ggml_element_size(cur)); + ggml_set_name(Kcur, "Kcur"); + + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, cur, head_dim, n_head_kv, N, + head_dim * ggml_element_size(cur), + head_dim * (n_head + 2 * n_head_kv) * ggml_element_size(cur), + head_dim * (n_head + n_head_kv) * ggml_element_size(cur)); + ggml_set_name(Vcur, "Vcur"); + + // using mode = 2 for neox mode + Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2); + Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2); + + // store key and value to memory + { + struct ggml_tensor* k = ggml_view_1d( + ctx0, model.memory_k, N * n_head_kv * head_dim, + (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * + (il * n_ctx + n_past)); + ggml_set_name(k, "k"); + struct ggml_tensor* v = ggml_view_1d( + ctx0, model.memory_v, N * n_head_kv * head_dim, + (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * + (il * n_ctx + n_past)); + ggml_set_name(v, "v"); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * K = ggml_permute( + ctx0, + ggml_reshape_3d( + ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, + il * n_ctx * + ggml_element_size(model.memory_k) * + n_head_kv * + head_dim), + head_dim, n_head_kv, n_past + N), + 0, 2, 1, 3); + + // K * Q + + K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); + ggml_set_name(K, "K"); + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) + ); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor* V = ggml_permute( + ctx0, + ggml_reshape_3d( + ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, + il * n_ctx * + ggml_element_size(model.memory_v) * + n_head_kv * + head_dim), + head_dim, n_head_kv, n_past + N), + 0, 2, 1, 3); + + V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); + ggml_set_name(V, "V"); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + // offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + } // end of attention + + lctx.use_buf(ctx0, 1); + //ggml_cuda_set_scratch(1); + + + struct ggml_tensor* inpFF = layernorm_output; + ggml_set_name(inpFF, "inpFF"); + struct ggml_tensor* attn_out = ggml_cpy( + ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + //offload_func(attn_out); + ggml_set_name(attn_out, "attn_out"); + { + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up, inpFF); + //offload_func(cur); + ggml_set_name(cur, "inpFF*ff_up"); + cur = ggml_gelu(ctx0, cur); + //offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); + //offload_func(cur); + ggml_set_name(cur, "gelu_cur*ff_down"); + } + + cur = ggml_add(ctx0, cur, attn_out); + cur = ggml_add(ctx0, cur, inpL); + ggml_set_name(cur, "inpFF_+_result_attn_out"); + // input for next layer + inpL = cur; + } // end of layer loop + + lctx.use_buf(ctx0, 0); + //ggml_cuda_set_scratch(0); + + // used at the end to optionally extract the embeddings + struct ggml_tensor * embeddings = NULL; + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU + } +#endif // GGML_USE_CUBLAS + + // norm + { + cur = ggml_norm(ctx0, cur); + // offload_func(cur); + ggml_set_name(cur, "norm_cur"); + + // inpL = ln_f_g*inpL + ln_f_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.output_norm, cur), + cur), + ggml_repeat(ctx0, model.output_norm_b, cur)); + // offload_func(cur); + ggml_set_name(cur, "result_norm"); + + embeddings = cur; + } + + + // language modelling head + cur = ggml_mul_mat(ctx0, model.lm_head, cur); + //offload_func(cur); + ggml_set_name(cur, "result_lm_head"); + + // cur = ggml_mul_mat(ctx0, model.output, cur); + // ggml_set_name(cur, "result_output"); + + lctx.use_buf(ctx0, -1); + + // logits -> probs + //cur = ggml_soft_max_inplace(ctx0, cur); + + // run the computation + ggml_build_forward_expand(&gf, cur); + +#ifdef GGML_USE_METAL + if (lctx.ctx_metal && N == 1) { + ggml_metal_graph_compute(lctx.ctx_metal, &gf); + ggml_metal_get_tensor (lctx.ctx_metal, cur); + } else { + // IMPORTANT: + // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla + // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX + // coprocessor. + // + // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. + // But for now, we have focused only on Matrix x Vector Metal multiplication. + // + // TODO: avoid these syncs via shared memory (ref #1696) + // + if (lctx.ctx_metal) { + // We need to sync the GPU KV cache with the CPU KV cache + ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); + ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); + } + + ggml_graph_compute(ctx0, &gf); + } +#else + ggml_graph_compute(ctx0, &gf); +#endif + + if (cgraph_fname) { + ggml_graph_export(&gf, cgraph_fname); + } + +#ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(&gf); +#endif + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); + + // update kv token count + lctx.model.kv_self.n = n_past + N; + + // extract logits + { + auto & logits_out = lctx.logits; + + if (lctx.logits_all) { + logits_out.resize(n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N); + } else { + // return result for just the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + } + } + + // extract embeddings + if (!lctx.embedding.empty()) { + auto & embedding_out = lctx.embedding; + + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + } + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + +#if 0 + printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0); +#endif + + ggml_free(ctx0); + + // measure the performance only for the single-token evals + if (N == 1) { + lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.n_eval++; + } + else if (N > 1) { + lctx.t_p_eval_us += ggml_time_us() - t_start_us; + lctx.n_p_eval += N; + } + + return true; +} + +// +// tokenizer +// + +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +struct llama_sp_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; +}; + +static_assert(std::is_trivially_copyable::value, "llama_sp_symbol is not trivially copyable"); + +struct llama_sp_bigram { + struct comparator { + bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + return (l.score < r.score) || (l.score == r.score && l.left > r.left); + } + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llama_sp_symbol::index left; + llama_sp_symbol::index right; + float score; + size_t size; +}; + +// original implementation: +// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 +struct llama_tokenizer { + llama_tokenizer(const falcon_vocab & vocab): vocab_(vocab) {} + + void tokenize(const std::string & text, std::vector & output) { + // split string into utf8 chars + int index = 0; + size_t offs = 0; + while (offs < text.size()) { + llama_sp_symbol sym; + size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); + sym.text = text.c_str() + offs; + sym.n = char_len; + offs += char_len; + sym.prev = index - 1; + sym.next = offs == text.size() ? -1 : index + 1; + index++; + symbols_.emplace_back(sym); + } + + // seed the work queue with all possible 2-character tokens. + for (size_t i = 1; i < symbols_.size(); ++i) { + try_add_bigram(i - 1, i); + } + + // keep substituting the highest frequency pairs for as long as we can. + while (!work_queue_.empty()) { + auto bigram = work_queue_.top(); + work_queue_.pop(); + + auto & left_sym = symbols_[bigram.left]; + auto & right_sym = symbols_[bigram.right]; + + // if one of the symbols already got merged, skip it. + if (left_sym.n == 0 || right_sym.n == 0 || + left_sym.n + right_sym.n != bigram.size) { + continue; + } + + // merge the right sym into the left one + left_sym.n += right_sym.n; + right_sym.n = 0; + + //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); + + // remove the right sym from the chain + left_sym.next = right_sym.next; + if (right_sym.next >= 0) { + symbols_[right_sym.next].prev = bigram.left; + } + + // find more substitutions + try_add_bigram(left_sym.prev, bigram.left); + try_add_bigram(bigram.left, left_sym.next); + } + + for (int i = 0; i != -1; i = symbols_[i].next) { + auto & symbol = symbols_[i]; + auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n)); + + if (token == vocab_.token_to_id.end()) { + // output any symbols that did not form tokens as bytes. + for (int j = 0; j < (int) symbol.n; ++j) { + falcon_vocab::id token_id = static_cast(symbol.text[j]) + 3; + output.push_back(token_id); + } + } else { + output.push_back((*token).second); + } + } + } + +private: + void try_add_bigram(int left, int right) { + if (left == -1 || right == -1) { + return; + } + + const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n); + auto token = vocab_.token_to_id.find(text); + + if (token == vocab_.token_to_id.end()) { + return; + } + + if (static_cast((*token).second) >= vocab_.id_to_token.size()) { + return; + } + + const auto &tok_score = vocab_.id_to_token[(*token).second]; + + llama_sp_bigram bigram; + bigram.left = left; + bigram.right = right; + bigram.score = tok_score.score; + bigram.size = text.size(); + work_queue_.push(bigram); + } + + const falcon_vocab & vocab_; + std::vector symbols_; + llama_sp_bigram::queue work_queue_; +}; + +static std::vector falcon_tokenize(const falcon_vocab & vocab, const std::string & text, bool bos) { + llama_tokenizer tokenizer(vocab); + std::vector output; + + if (text.empty()) { + return output; + } + + if (bos) { + output.push_back(falcon_token_bos()); + } + + tokenizer.tokenize(text, output); + return output; +} + +// +// sampling +// + +void llama_sample_softmax(struct falcon_context * ctx, llama_token_data_array * candidates) { + assert(candidates->size > 0); + + const int64_t t_start_sample_us = ggml_time_us(); + + // Sort the logits in descending order + if (!candidates->sorted) { + std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + candidates->sorted = true; + } + + float max_l = candidates->data[0].logit; + float cum_sum = 0.0f; + for (size_t i = 0; i < candidates->size; ++i) { + float p = expf(candidates->data[i].logit - max_l); + candidates->data[i].p = p; + cum_sum += p; + } + for (size_t i = 0; i < candidates->size; ++i) { + candidates->data[i].p /= cum_sum; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_top_k(struct falcon_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { + const int64_t t_start_sample_us = ggml_time_us(); + + k = std::max(k, (int) min_keep); + k = std::min(k, (int) candidates->size); + + // Sort scores in descending order + if (!candidates->sorted) { + auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + if (k == (int) candidates->size) { + std::sort(candidates->data, candidates->data + candidates->size, comp); + } else { + std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } + candidates->sorted = true; + } + candidates->size = k; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_top_p(struct falcon_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p >= 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(ctx, candidates); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = candidates->size; + + for (size_t i = 0; i < candidates->size; ++i) { + cum_sum += candidates->data[i].p; + + // Check if the running sum is greater than p or if we have kept at least min_keep tokens + if (cum_sum > p && i >= min_keep) { + last_idx = i; + break; + } + } + + // Resize the output vector to keep only the top-p tokens + candidates->size = last_idx; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_tail_free(struct falcon_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { + if (z >= 1.0f || candidates->size <= 2) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(nullptr, candidates); + + // Compute the first and second derivatives + std::vector first_derivatives(candidates->size - 1); + std::vector second_derivatives(candidates->size - 2); + + for (size_t i = 0; i < first_derivatives.size(); ++i) { + first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p; + } + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; + } + + // Calculate absolute value of second derivatives + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = abs(second_derivatives[i]); + } + + // Normalize the second derivatives + float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + + float cum_sum = 0.0f; + size_t last_idx = candidates->size; + for (size_t i = 0; i < second_derivatives.size(); ++i) { + cum_sum += second_derivatives[i]; + + // Check if the running sum is greater than z or if we have kept at least min_keep tokens + if (cum_sum > z && i >= min_keep) { + last_idx = i; + break; + } + } + + // Resize the output vector to keep only the tokens above the tail location + candidates->size = last_idx; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + + +void llama_sample_typical(struct falcon_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + // Reference implementation: + // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr + if (p >= 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + // Compute the softmax of logits and calculate entropy + llama_sample_softmax(nullptr, candidates); + + float entropy = 0.0f; + for (size_t i = 0; i < candidates->size; ++i) { + entropy += -candidates->data[i].p * logf(candidates->data[i].p); + } + + // Compute the absolute difference between negative log probability and entropy for each candidate + std::vector shifted_scores; + for (size_t i = 0; i < candidates->size; ++i) { + float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy); + shifted_scores.push_back(shifted_score); + } + + // Sort tokens based on the shifted_scores and their corresponding indices + std::vector indices(candidates->size); + std::iota(indices.begin(), indices.end(), 0); + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + return shifted_scores[a] < shifted_scores[b]; + }); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = indices.size(); + + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + cum_sum += candidates->data[idx].p; + + // Check if the running sum is greater than typical or if we have kept at least min_keep tokens + if (cum_sum > p && i >= min_keep - 1) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the locally typical tokens + std::vector new_candidates; + for (size_t i = 0; i < last_idx; ++i) { + size_t idx = indices[i]; + new_candidates.push_back(candidates->data[idx]); + } + + // Replace the data in candidates with the new_candidates data + std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); + candidates->size = new_candidates.size(); + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_temperature(struct falcon_context * ctx, llama_token_data_array * candidates_p, float temp) { + const int64_t t_start_sample_us = ggml_time_us(); + + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= temp; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_repetition_penalty(struct falcon_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { + if (last_tokens_size == 0 || penalty == 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + for (size_t i = 0; i < candidates->size; ++i) { + const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); + if (token_iter == last_tokens + last_tokens_size) { + continue; + } + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty; + } else { + candidates->data[i].logit /= penalty; + } + } + + candidates->sorted = false; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_frequency_and_presence_penalties(struct falcon_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { + if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + // Create a frequency map to count occurrences of each token in last_tokens + std::unordered_map token_count; + for (size_t i = 0; i < last_tokens_size; ++i) { + token_count[last_tokens_p[i]]++; + } + + // Apply frequency and presence penalties to the candidates + for (size_t i = 0; i < candidates->size; ++i) { + auto token_iter = token_count.find(candidates->data[i].id); + if (token_iter == token_count.end()) { + continue; + } + + int count = token_iter->second; + candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + } + + candidates->sorted = false; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + + +llama_token llama_sample_token_mirostat(struct falcon_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { + assert(ctx); + auto N = float(falcon_n_vocab(ctx)); + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(nullptr, candidates); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); + + // Sample the next word X using top-k sampling + llama_sample_top_k(nullptr, candidates, int(k), 1); + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + llama_token X = llama_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + } + return X; +} + +llama_token llama_sample_token_mirostat_v2(struct falcon_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { + assert(ctx); + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(ctx, candidates); + + // Truncate the words with surprise values greater than mu + candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return -log2f(candidate.p) > *mu; + })); + + if (candidates->size == 0) { + candidates->size = 1; + } + + // Normalize the probabilities of the remaining words + llama_sample_softmax(ctx, candidates); + + // Sample the next word X from the remaining words + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + llama_token X = llama_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + return X; +} + +llama_token llama_sample_token_greedy(struct falcon_context * ctx, llama_token_data_array * candidates) { + const int64_t t_start_sample_us = ggml_time_us(); + + // Find max element + auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit < b.logit; + }); + + llama_token result = max_iter->id; + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + } + return result; +} + +llama_token llama_sample_token(struct falcon_context * ctx, llama_token_data_array * candidates) { + assert(ctx); + const int64_t t_start_sample_us = ggml_time_us(); + llama_sample_softmax(nullptr, candidates); + + std::vector probs; + probs.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + probs.push_back(candidates->data[i].p); + } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + auto & rng = ctx->rng; + int idx = dist(rng); + + llama_token result = candidates->data[idx].id; + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + return result; +} + +// +// quantization +// + +static void llama_convert_tensor_internal(const falcon_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) { + if (output.size < nelements * sizeof(float)) { + output.resize(nelements * sizeof(float)); + } + float * f32_output = (float *) output.addr; + + quantize_fns_t qtype; + if (ggml_is_quantized(tensor.type)) { + qtype = ggml_internal_get_quantize_fn(tensor.type); + if (qtype.dequantize_row_q == NULL) { + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); + } + } else if (tensor.type != GGML_TYPE_F16) { + throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); + } + + if (nthread < 2) { + if (tensor.type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); + } else if (ggml_is_quantized(tensor.type)) { + qtype.dequantize_row_q(tensor.data, f32_output, nelements); + } else { + LLAMA_ASSERT(false); // unreachable + } + return; + } + + auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type); + auto block_size_bytes = ggml_type_size(tensor.type); + + LLAMA_ASSERT(nelements % block_size == 0); + auto nblocks = nelements / block_size; + auto blocks_per_thread = nblocks / nthread; + auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + + std::vector workers; + for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { + auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread + auto thr_elems = thr_blocks * block_size; // number of elements for this thread + auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + + auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else { + qtype.dequantize_row_q(inbuf, outbuf, nels); + } + }; + workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + in_buff_offs += thr_block_bytes; + out_buff_offs += thr_elems; + } + for (auto & worker : workers) { + worker.join(); + } + +} + +static void falcon_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { + ggml_type quantized_type; + llama_ftype ftype = params->ftype; + int nthread = params->nthread; + + switch (params->ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; + case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; + case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; + case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; + case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; + case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; + +#ifdef GGML_USE_K_QUANTS + // K-quants + case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; + case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; +#endif + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); + } + + if (nthread <= 0) { + nthread = std::thread::hardware_concurrency(); + } + + std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false, + /*vocab_only*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype); + +#ifdef GGML_USE_K_QUANTS + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + for (auto& tensor : model_loader->tensors_map.tensors) { + if (tensor.name.find("attention.wv.weight") != std::string::npos) { + ++n_attention_wv; + } + else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + ++n_feed_forward_w2; + } + } + + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; +#endif + + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector hist_all(1 << 4, 0); + + std::vector workers; + std::mutex mutex; + + size_t idx = 0; + for (falcon_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); + + printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", + ++idx, model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + ggml_type_name(tensor.type)); + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? + + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); + quantize &= params->quantize_output_tensor || tensor.name != "output.weight"; + quantize &= quantized_type != tensor.type; + + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_buffer work; + + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + printf("(Not quantizing) size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; +#ifdef GGML_USE_K_QUANTS + // if (tensor.name == ".mlp.dense_") { + // new_type = GGML_TYPE_Q6_K; + // } + // TODO falcon +#endif + + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_buffer f32_conv_buf; + + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type))); + } else { + llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); + f32_data = (float *) f32_conv_buf.addr; + } + + printf("quantizing .. "); + fflush(stdout); + + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1)/chunk_size; + const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + int64_t tot_count = 0; + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; + tot_count += hist_cur[i]; + } + + if (tot_count > 0) { + for (size_t i = 0; i < hist_cur.size(); i++) { + printf("%5.3f ", hist_cur[i] / float(nelements)); + } + } + printf("\n"); + } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } + + if (sum_all > 0) { + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + printf("%5.3f ", hist_all[i] / float(sum_all)); + } + printf("\n"); + } + } +} + +// +// interface implementation +// + +struct falcon_context * falcon_init_from_file( + const char * path_model, + struct falcon_context_params params) { + ggml_time_init(); + + falcon_context * ctx = new falcon_context; + + if (params.seed < 0) { + params.seed = time(NULL); + } + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + fprintf(stderr, "."); + fflush(stderr); + if (percentage >= 100) { + fprintf(stderr, "\n"); + } + } + }; + } + + ctx->rng = std::mt19937(params.seed); + ctx->logits_all = params.logits_all; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!falcon_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock, + params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + fprintf(stderr, "%s: failed to load model\n", __func__); + llama_free(ctx); + return nullptr; + } + + // reserve memory for context buffers + if (!params.vocab_only) { + if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { + fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + llama_free(ctx); + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + + const auto & hparams = ctx->model.hparams; + + // resized during inference + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_vocab); + } + + if (params.embedding){ + ctx->embedding.resize(hparams.n_embd); + } + + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); + } + +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers + ctx->ctx_metal = ggml_metal_init(); + + void *data_ptr = NULL; + size_t data_size = 0; + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size= ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size= ggml_get_mem_size(ctx->model.ctx); + } + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + fprintf(stderr, "%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size)); + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size)); +#undef LLAMA_METAL_CHECK_BUF + } +#endif + + return ctx; +} + +void llama_free(struct falcon_context * ctx) { + delete ctx; +} + +int falcon_model_quantize( + const char * fname_inp, + const char * fname_out, + const llama_model_quantize_params *params) { + try { + falcon_model_quantize_internal(fname_inp, fname_out, params); + return 0; + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_apply_lora_from_file_internal(struct falcon_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + + auto & model = ctx->model; + + const int64_t t_start_lora_us = ggml_time_us(); + + auto fin = std::ifstream(path_lora, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora); + return 1; + } + + // verify magic and version + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + fprintf(stderr, "%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: unsupported file version\n", __func__ ); + return 1; + } + } + + int32_t lora_r; + int32_t lora_alpha; + fin.read((char *) &lora_r, sizeof(lora_r)); + fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + float scaling = (float)lora_alpha / (float)lora_r; + + fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + std::unordered_map lora_tensors; + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + + // load base model + std::unique_ptr model_loader; + ggml_context * base_ctx = NULL; + llama_buffer base_buf; + if (path_base_model) { + fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + + size_t ctx_size; + size_t mmapped_size; + model_loader->calc_sizes(&ctx_size, &mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size; + base_params.mem_buffer = base_buf.addr; + base_params.no_alloc = model_loader->use_mmap; + + base_ctx = ggml_init(base_params); + + model_loader->ggml_ctx = base_ctx; + + // maybe this should in llama_model_loader + if (model_loader->use_mmap) { + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + if (fin.eof()) { + break; + } + + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + + std::string name; + { + char buf[1024]; + fin.read(buf, length); + name = std::string(buf, length); + } + + // check for lora suffix and get the type of tensor + const std::string lora_suffix = ".lora"; + size_t pos = name.rfind(lora_suffix); + if (pos == std::string::npos) { + fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + return 1; + } + + std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + + if (model_tensors.find(base_name) == model_tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + fprintf(stderr, "%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor* lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); + + lora_tensors[name] = lora_tensor; + + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + falcon_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; + } + + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) { + fprintf(stderr, "."); + } + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +int llama_apply_lora_from_file(struct falcon_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_get_kv_cache_token_count(const struct falcon_context * ctx) { + return ctx->model.kv_self.n; +} + +#define LLAMA_MAX_RNG_STATE (64*1024) + +void llama_set_rng_seed(struct falcon_context * ctx, int seed) { + if (seed < 0) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + +// Returns the *maximum* size of the state +size_t llama_get_state_size(const struct falcon_context * ctx) { + // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. + // for reference, std::mt19937(1337) serializes to 6701 bytes. + const size_t s_rng_size = sizeof(size_t); + const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_logits_capacity = sizeof(size_t); + const size_t s_logits_size = sizeof(size_t); + const size_t s_logits = ctx->logits.capacity() * sizeof(float); + const size_t s_embedding_size = sizeof(size_t); + const size_t s_embedding = ctx->embedding.size() * sizeof(float); + const size_t s_kv_size = sizeof(size_t); + const size_t s_kv_ntok = sizeof(int); + const size_t s_kv = ctx->model.kv_self.buf.size; + + const size_t s_total = ( + + s_rng_size + + s_rng + + s_logits_capacity + + s_logits_size + + s_logits + + s_embedding_size + + s_embedding + + s_kv_size + + s_kv_ntok + + s_kv + ); + + return s_total; +} + +// Copies the state to the specified destination address +size_t llama_copy_state_data(struct falcon_context * ctx, uint8_t * dst) { + uint8_t * out = dst; + + // copy rng + { + std::stringstream rng_ss; + rng_ss << ctx->rng; + + const size_t rng_size = rng_ss.str().size(); + char rng_buf[LLAMA_MAX_RNG_STATE]; + + memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); + memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); + + memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); + memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE; + } + + // copy logits + { + const size_t logits_cap = ctx->logits.capacity(); + const size_t logits_size = ctx->logits.size(); + + memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap); + memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size); + + if (logits_size) { + memcpy(out, ctx->logits.data(), logits_size * sizeof(float)); + } + + out += logits_cap * sizeof(float); + } + + // copy embeddings + { + const size_t embedding_size = ctx->embedding.size(); + + memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size); + + if (embedding_size) { + memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); + out += embedding_size * sizeof(float); + } + } + + // copy kv cache + { + const auto & kv_self = ctx->model.kv_self; + const auto & hparams = ctx->model.hparams; + const int n_layer = hparams.n_layer; + const int n_embd = hparams.n_embd; + const int n_ctx = hparams.n_ctx; + + const size_t kv_size = kv_self.buf.size; + const int kv_ntok = llama_get_kv_cache_token_count(ctx); + + memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); + memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); + + if (kv_size) { + const size_t elt_size = ggml_element_size(kv_self.k); + + char buffer[4096]; + + ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_cgraph gf{}; + gf.n_threads = 1; + + ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + kout3d->data = out; + out += ggml_nbytes(kout3d); + + ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + vout3d->data = out; + out += ggml_nbytes(vout3d); + + ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + n_embd, kv_ntok, n_layer, + elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + + ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + kv_ntok, n_embd, n_layer, + elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + ggml_graph_compute(cpy_ctx, &gf); + + ggml_free(cpy_ctx); + } + } + + const size_t written = out - dst; + const size_t max_size = llama_get_state_size(ctx); + + LLAMA_ASSERT(written <= max_size); + + return written; +} + +// Sets the state reading from the specified source address +size_t llama_set_state_data(struct falcon_context * ctx, uint8_t * src) { + uint8_t * inp = src; + + // set rng + { + size_t rng_size; + char rng_buf[LLAMA_MAX_RNG_STATE]; + + memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); + memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; + + std::stringstream rng_ss; + rng_ss.str(std::string(&rng_buf[0], rng_size)); + rng_ss >> ctx->rng; + + LLAMA_ASSERT(rng_ss.fail() == false); + } + + // set logits + { + size_t logits_cap; + size_t logits_size; + + memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); + memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); + + LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); + + if (logits_size) { + ctx->logits.resize(logits_size); + memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); + } + + inp += logits_cap * sizeof(float); + } + + // set embeddings + { + size_t embedding_size; + + memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); + + LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); + + if (embedding_size) { + memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); + inp += embedding_size * sizeof(float); + } + } + + // set kv cache + { + const auto & kv_self = ctx->model.kv_self; + const auto & hparams = ctx->model.hparams; + const int n_layer = hparams.n_layer; + const int n_embd = hparams.n_embd; + const int n_ctx = hparams.n_ctx; + + size_t kv_size; + int kv_ntok; + + memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); + memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); + + if (kv_size) { + LLAMA_ASSERT(kv_self.buf.size == kv_size); + + const size_t elt_size = ggml_element_size(kv_self.k); + + char buffer[4096]; + + ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_cgraph gf{}; + gf.n_threads = 1; + + ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + kin3d->data = (void *) inp; + inp += ggml_nbytes(kin3d); + + ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + vin3d->data = (void *) inp; + inp += ggml_nbytes(vin3d); + + ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + n_embd, kv_ntok, n_layer, + elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + + ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + kv_ntok, n_embd, n_layer, + elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + ggml_graph_compute(cpy_ctx, &gf); + + ggml_free(cpy_ctx); + } + + ctx->model.kv_self.n = kv_ntok; + } + + const size_t nread = inp - src; + const size_t max_size = llama_get_state_size(ctx); + + LLAMA_ASSERT(nread <= max_size); + + return nread; +} + +bool llama_load_session_file(struct falcon_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(path_session, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + + falcon_hparams session_hparams; + file.read_raw(&session_hparams, sizeof(falcon_hparams)); + + if (session_hparams != ctx->model.hparams) { + fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size - file.tell(); + const size_t n_state_size_max = llama_get_state_size(ctx); + + if (n_state_size_cur > n_state_size_max) { + fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); + return false; + } + + std::vector state_data(n_state_size_max); + file.read_raw(state_data.data(), n_state_size_cur); + + llama_set_state_data(ctx, state_data.data()); + } + + return true; +} + +bool llama_save_session_file(struct falcon_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + llama_file file(path_session, "wb"); + + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); + + file.write_raw(&ctx->model.hparams, sizeof(falcon_hparams)); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state + { + const size_t n_state_size_max = llama_get_state_size(ctx); + + std::vector state_data(n_state_size_max); + const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data()); + + file.write_raw(state_data.data(), n_state_size_cur); + } + + return true; +} + +int falcon_eval( + struct falcon_context * ctx, + const llama_token * tokens, + int n_tokens, + int n_past, + int n_threads) { + if (!falcon_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + + return 0; +} + +int falcon_eval_export(struct falcon_context * ctx, const char * fname) { + const int n_batch = 1; + const int n_ctx = 512 - n_batch; + + const std::vector tmp(n_batch, falcon_token_bos()); + + if (!falcon_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + return 0; +} + +int falcon_tokenize( + struct falcon_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + auto res = falcon_tokenize(ctx->vocab, text, add_bos); + + if (n_max_tokens < (int) res.size()) { + fprintf(stderr, "%s: too many tokens\n", __func__); + return -((int) res.size()); + } + + for (size_t i = 0; i < res.size(); i++) { + tokens[i] = res[i]; + } + + return res.size(); +} + +int falcon_n_vocab(const struct falcon_context * ctx) { + return ctx->vocab.id_to_token.size(); +} + +int falcon_n_ctx(const struct falcon_context * ctx) { + return ctx->model.hparams.n_ctx; +} + +int falcon_n_embd(const struct falcon_context * ctx) { + return ctx->model.hparams.n_embd; +} + +int falcon_get_vocab( + const struct falcon_context * ctx, + const char * * strings, + float * scores, + int capacity) { + int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); + for (int i = 0; ivocab.id_to_token[i].tok.c_str(); + scores[i] = ctx->vocab.id_to_token[i].score; + } + return n; +} + +float * falcon_get_logits(struct falcon_context * ctx) { + return ctx->logits.data(); +} + +float * falcon_get_embeddings(struct falcon_context * ctx) { + return ctx->embedding.data(); +} + +const char * falcon_token_to_str(const struct falcon_context * ctx, llama_token token) { + if (token >= falcon_n_vocab(ctx)) { + return nullptr; + } + + return ctx->vocab.id_to_token[token].tok.c_str(); +} + +llama_token falcon_token_bos() { + return 1; +} + +llama_token falcon_token_eos() { + return 2; +} + +llama_token falcon_token_nl() { + return 193; +} + +llama_token falcon_token_cr() { + return 195; +} + +void falcon_print_timings(struct falcon_context * ctx) { + const int64_t t_end_us = ggml_time_us(); + + const int32_t n_sample = std::max(1, ctx->n_sample); + const int32_t n_eval = std::max(1, ctx->n_eval); + const int32_t n_p_eval = std::max(1, ctx->n_p_eval); + + fprintf(stderr, "\n"); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); + fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); + fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval); + fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); +} + +void llama_reset_timings(struct falcon_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_sample_us = ctx->n_sample = 0; + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; +} + +const char * falcon_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + + return s.c_str(); +} + +// For internal test use +std::vector>& llama_internal_get_tensor_map(struct falcon_context * ctx) { + return ctx->model.tensors_by_name; +} diff --git a/libfalcon.h b/libfalcon.h new file mode 100644 index 000000000..739e12dab --- /dev/null +++ b/libfalcon.h @@ -0,0 +1,317 @@ +#ifndef FALCON_H +#define FALCON_H + +#include "ggml.h" +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES +#else +#define LLAMA_MAX_DEVICES 1 +#endif // GGML_USE_CUBLAS +#include +#include +#include + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define LLAMA_API __declspec(dllexport) +# else +# define LLAMA_API __declspec(dllimport) +# endif +# else +# define LLAMA_API __attribute__ ((visibility ("default"))) +# endif +#else +# define LLAMA_API +#endif + +#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' + +#define LLAMA_FILE_VERSION 3 +#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT +#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML +#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN +#define LLAMA_SESSION_VERSION 1 + +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) +// Defined when llama.cpp is compiled with support for offloading model layers to GPU. +#define LLAMA_SUPPORTS_GPU_OFFLOAD +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + // + // C interface + // + // TODO: show sample usage + // + + struct falcon_context; + + typedef int llama_token; + + typedef struct llama_token_data { + llama_token id; // token id + float logit; // log-odds of the token + float p; // probability of the token + } llama_token_data; + + typedef struct llama_token_data_array { + llama_token_data * data; + size_t size; + bool sorted; + } llama_token_data_array; + + typedef void (*llama_progress_callback)(float progress, void *ctx); + + struct falcon_context_params { + int n_ctx; // text context + int n_batch; // prompt processing batch size + int n_gpu_layers; // number of layers to store in VRAM + int main_gpu; // the GPU that is used for scratch and small tensors + float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + int seed; // RNG seed, -1 for random + + bool f16_kv; // use fp16 for KV cache + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool embedding; // embedding mode only + + // called with a progress value between 0 and 1, pass NULL to disable + llama_progress_callback progress_callback; + // context pointer passed to the progress callback + void * progress_callback_user_data; + }; + + // model file types + enum llama_ftype { + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + }; + + // model quantization parameters + typedef struct llama_model_quantize_params { + int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + } llama_model_quantize_params; + + LLAMA_API struct falcon_context_params falcon_context_default_params(); + LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); + + LLAMA_API bool llama_mmap_supported(); + LLAMA_API bool llama_mlock_supported(); + + // TODO: not great API - very likely to change + // Initialize the llama + ggml backend + // Call once at the start of the program + LLAMA_API void llama_init_backend(); + + LLAMA_API int64_t llama_time_us(); + + // Various functions for loading a ggml llama model. + // Allocate (almost) all memory needed for the model. + // Return NULL on failure + LLAMA_API struct falcon_context * falcon_init_from_file( + const char * path_model, + struct falcon_context_params params); + + // Frees all allocated memory + LLAMA_API void llama_free(struct falcon_context * ctx); + + // Returns 0 on success + LLAMA_API int falcon_model_quantize( + const char * fname_inp, + const char * fname_out, + const llama_model_quantize_params * params); + + // Apply a LoRA adapter to a loaded model + // path_base_model is the path to a higher quality model to use as a base for + // the layers modified by the adapter. Can be NULL to use the current loaded model. + // The model needs to be reloaded before applying a new adapter, otherwise the adapter + // will be applied on top of the previous one + // Returns 0 on success + LLAMA_API int llama_apply_lora_from_file( + struct falcon_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + + // Returns the number of tokens in the KV cache + LLAMA_API int llama_get_kv_cache_token_count(const struct falcon_context * ctx); + + // Sets the current rng seed. + LLAMA_API void llama_set_rng_seed(struct falcon_context * ctx, int seed); + + // Returns the maximum size in bytes of the state (rng, logits, embedding + // and kv_cache) - will often be smaller after compacting tokens + LLAMA_API size_t llama_get_state_size(const struct falcon_context * ctx); + + // Copies the state to the specified destination address. + // Destination needs to have allocated enough memory. + // Returns the number of bytes copied + LLAMA_API size_t llama_copy_state_data(struct falcon_context * ctx, uint8_t * dst); + + // Set the state reading from the specified address + // Returns the number of bytes read + LLAMA_API size_t llama_set_state_data(struct falcon_context * ctx, uint8_t * src); + + // Save/load session file + LLAMA_API bool llama_load_session_file(struct falcon_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_API bool llama_save_session_file(struct falcon_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + + // Run the llama inference to obtain the logits and probabilities for the next token. + // tokens + n_tokens is the provided batch of new tokens to process + // n_past is the number of tokens to use from previous eval calls + // Returns 0 on success + LLAMA_API int falcon_eval( + struct falcon_context * ctx, + const llama_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + // Export a static computation graph for context of 511 and batch size of 1 + // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these + // parameters here to keep things simple + // IMPORTANT: do not use for anything else other than debugging and testing! + LLAMA_API int falcon_eval_export(struct falcon_context * ctx, const char * fname); + + // Convert the provided text into tokens. + // The tokens pointer must be large enough to hold the resulting tokens. + // Returns the number of tokens on success, no more than n_max_tokens + // Returns a negative number on failure - the number of tokens that would have been returned + // TODO: not sure if correct + LLAMA_API int falcon_tokenize( + struct falcon_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos); + + LLAMA_API int falcon_n_vocab(const struct falcon_context * ctx); + LLAMA_API int falcon_n_ctx (const struct falcon_context * ctx); + LLAMA_API int falcon_n_embd (const struct falcon_context * ctx); + + // Get the vocabulary as output parameters. + // Returns number of results. + LLAMA_API int falcon_get_vocab( + const struct falcon_context * ctx, + const char * * strings, + float * scores, + int capacity); + + // Token logits obtained from the last call to llama_eval() + // The logits for the last token are stored in the last row + // Can be mutated in order to change the probabilities of the next token + // Rows: n_tokens + // Cols: n_vocab + LLAMA_API float * falcon_get_logits(struct falcon_context * ctx); + + // Get the embeddings for the input + // shape: [n_embd] (1-dimensional) + LLAMA_API float * falcon_get_embeddings(struct falcon_context * ctx); + + // Token Id -> String. Uses the vocabulary in the provided context + LLAMA_API const char * falcon_token_to_str(const struct falcon_context * ctx, llama_token token); + + // Special tokens + LLAMA_API llama_token falcon_token_bos(); + LLAMA_API llama_token falcon_token_eos(); + LLAMA_API llama_token falcon_token_nl(); + + // Sampling functions + + /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. + LLAMA_API void llama_sample_repetition_penalty(struct falcon_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); + + /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. + LLAMA_API void llama_sample_frequency_and_presence_penalties(struct falcon_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. + LLAMA_API void llama_sample_softmax(struct falcon_context * ctx, llama_token_data_array * candidates); + + /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_API void llama_sample_top_k(struct falcon_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); + + /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_API void llama_sample_top_p(struct falcon_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); + + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. + LLAMA_API void llama_sample_tail_free(struct falcon_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); + + /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. + LLAMA_API void llama_sample_typical(struct falcon_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); + LLAMA_API void llama_sample_temperature(struct falcon_context * ctx, llama_token_data_array * candidates, float temp); + + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_API llama_token llama_sample_token_mirostat(struct falcon_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); + + /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_API llama_token llama_sample_token_mirostat_v2(struct falcon_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); + + /// @details Selects the token with the highest probability. + LLAMA_API llama_token llama_sample_token_greedy(struct falcon_context * ctx, llama_token_data_array * candidates); + + /// @details Randomly selects a token from the candidates based on their probabilities. + LLAMA_API llama_token llama_sample_token(struct falcon_context * ctx, llama_token_data_array * candidates); + + // Performance information + LLAMA_API void falcon_print_timings(struct falcon_context * ctx); + LLAMA_API void llama_reset_timings(struct falcon_context * ctx); + + // Print system information + LLAMA_API const char * falcon_print_system_info(void); + +#ifdef __cplusplus +} +#endif + +// Internal API to be implemented by llama.cpp and used by tests/benchmarks only +#ifdef LLAMA_API_INTERNAL + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct falcon_context * ctx); + +#endif + +#endif