# llama.cpp/example/llama-bench
Performance testing tool for llama.cpp.
## Table of contents

1. [Syntax](#syntax)
2. [Examples](#examples)
3. [Output formats](#output-formats)
## Syntax
```
usage: ./llama-bench [options]

options:
  -h, --help
  -m, --model <filename>            (default: models/7B/ggml-model-q4_0.gguf)
  -p, --n-prompt <n>                (default: 512)
  -n, --n-gen <n>                   (default: 128)
  -b, --batch-size <n>              (default: 512)
  --memory-f32 <0|1>                (default: 0)
  -t, --threads <n>                 (default: 16)
  -ngl N, --n-gpu-layers <n>        (default: 99)
  -mg i, --main-gpu <i>             (default: 0)
  -mmq, --mul-mat-q <0|1>           (default: 1)
  -ts, --tensor_split <ts0/ts1/..>
  -r, --repetitions <n>             (default: 5)
  -o, --output <csv|json|md|sql>    (default: md)
  -v, --verbose                     (default: 0)
```
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
`llama-bench` can perform two types of tests:

- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
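To make the combinatorics concrete, here is a hypothetical invocation (using the default model path shown above); it runs one pp test and two tg tests at each of two thread counts:

```sh
# 3 test configurations (pp 512, tg 128, tg 256) x 2 thread counts = 6 tests,
# each repeated 5 times (the default for -r).
./llama-bench -m models/7B/ggml-model-q4_0.gguf -p 512 -n 128,256 -t 8,16
```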
For a description of the other options, see the main example.
## Examples
### Text generation with different models
```sh
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
```
model | size | params | backend | ngl | test | t/s |
---|---|---|---|---|---|---|
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
```sh
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
```
model | size | params | backend | ngl | n_batch | test | t/s |
---|---|---|---|---|---|---|---|
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
```sh
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
```
model | size | params | backend | threads | test | t/s |
---|---|---|---|---|---|---|
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 |
### Different numbers of layers offloaded to the GPU
```sh
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
```
model | size | params | backend | ngl | test | t/s |
---|---|---|---|---|---|---|
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
## Output formats
By default, `llama-bench` outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
### Markdown
```sh
$ ./llama-bench -o md
```
model | size | params | backend | ngl | test | t/s |
---|---|---|---|---|---|---|
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
### CSV
```sh
$ ./llama-bench -o csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
### JSON
```sh
$ ./llama-bench -o json
[
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 512,
"n_gen": 0,
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
},
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 0,
"n_gen": 128,
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
}
]
```
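The JSON output is convenient for post-processing with external tools. As a minimal sketch, assuming the `jq` utility is installed (it is not part of llama.cpp):

```sh
# Print one tab-separated line per test: n_prompt, n_gen, average tokens/s.
./llama-bench -o json | jq -r '.[] | [.n_prompt, .n_gen, .avg_ts] | @tsv'
```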
### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
```sh
$ ./llama-bench -o sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_threads INTEGER,
f16_kv INTEGER,
n_gpu_layers INTEGER,
main_gpu INTEGER,
mul_mat_q INTEGER,
tensor_split TEXT,
n_prompt INTEGER,
n_gen INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
avg_ts REAL,
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
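A minimal sketch of this workflow, assuming the `sqlite3` CLI is installed (the database file name `llama-bench.sqlite` is arbitrary):

```sh
# Append the results of a run to a SQLite database, then query them.
./llama-bench -o sql | sqlite3 llama-bench.sqlite
sqlite3 llama-bench.sqlite 'SELECT model_type, n_prompt, n_gen, avg_ts FROM test;'
```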