mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 13:30:35 +00:00
7
GGML Tips & Tricks
Georgi Gerganov edited this page 2024-01-08 10:03:31 +02:00
Table of Contents
Measuring the performance of the inference
-
Build with
LLAMA_PERF
: `make clean && LLAMA_PERF=1 make`
This adds `-DGGML_PERF` to the compile flags, which enables the internal ggml performance timers.
You will see output like this:
n_nodes = 1188
- 0: [ 4096, 1, 1] GET_ROWS ( 1) cpu = 0.019 / 0.019 ms, wall = 0.006 / 0.006 ms
- 1: [ 4096, 1, 1] RMS_NORM ( 1) cpu = 0.008 / 0.008 ms, wall = 0.008 / 0.008 ms
- 2: [ 4096, 1, 1] MUL ( 1) cpu = 0.001 / 0.001 ms, wall = 0.001 / 0.001 ms
- 3: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 0.814 / 0.814 ms, wall = 0.817 / 0.817 ms
- 4: [ 128, 32, 1] RESHAPE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms
- 5: [ 128, 32, 1] ROPE ( 1) cpu = 0.011 / 0.011 ms, wall = 0.011 / 0.011 ms
- 6: [ 4096, 1, 1] VIEW ( 1) cpu = 0.000 / 0.000 ms, wall = 0.000 / 0.000 ms
- 7: [ 4096, 1, 1] CPY ( 1) cpu = 0.004 / 0.004 ms, wall = 0.004 / 0.004 ms
- 8: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 3.273 / 3.273 ms, wall = 0.356 / 0.356 ms
- 9: [ 4096, 1, 1] RESHAPE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms
- 10: [ 1, 4096, 1] TRANSPOSE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms
- 11: [ 1, 4096, 1] VIEW ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms
- 12: [ 1, 4096, 1] CPY ( 1) cpu = 0.022 / 0.022 ms, wall = 0.023 / 0.023 ms
- 13: [ 17, 128, 32] VIEW ( 1) cpu = 0.001 / 0.001 ms, wall = 0.001 / 0.001 ms
- 14: [ 69632, 1, 1] VIEW ( 1) cpu = 0.000 / 0.000 ms, wall = 0.000 / 0.000 ms
- 15: [ 128, 32, 17] RESHAPE ( 1) cpu = 0.002 / 0.002 ms, wall = 0.000 / 0.000 ms
- 16: [ 128, 17, 32] PERMUTE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.000 / 0.000 ms
- 17: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 0.744 / 0.744 ms, wall = 0.246 / 0.246 ms
- 18: [ 128, 32, 1] RESHAPE ( 1) cpu = 0.001 / 0.001 ms, wall = 0.000 / 0.000 ms
...
perf_total_per_op_us[ NONE] = 0.000 ms
perf_total_per_op_us[ DUP] = 0.000 ms
perf_total_per_op_us[ ADD] = 0.339 ms
perf_total_per_op_us[ SUB] = 0.000 ms
perf_total_per_op_us[ MUL] = 0.271 ms
perf_total_per_op_us[ DIV] = 0.000 ms
perf_total_per_op_us[ SQR] = 0.000 ms
perf_total_per_op_us[ SQRT] = 0.000 ms
perf_total_per_op_us[ SUM] = 0.000 ms
perf_total_per_op_us[ MEAN] = 0.000 ms
perf_total_per_op_us[ REPEAT] = 0.000 ms
perf_total_per_op_us[ ABS] = 0.000 ms
perf_total_per_op_us[ SGN] = 0.000 ms
perf_total_per_op_us[ NEG] = 0.000 ms
perf_total_per_op_us[ STEP] = 0.000 ms
perf_total_per_op_us[ RELU] = 0.000 ms
perf_total_per_op_us[ GELU] = 0.000 ms
perf_total_per_op_us[ SILU] = 0.574 ms
perf_total_per_op_us[ NORM] = 0.000 ms
perf_total_per_op_us[ RMS_NORM] = 0.721 ms
perf_total_per_op_us[ MUL_MAT] = 95.358 ms
perf_total_per_op_us[ SCALE] = 0.166 ms
perf_total_per_op_us[ CPY] = 2.062 ms
perf_total_per_op_us[ RESHAPE] = 0.067 ms
perf_total_per_op_us[ VIEW] = 0.067 ms
perf_total_per_op_us[ PERMUTE] = 0.067 ms
perf_total_per_op_us[ TRANSPOSE] = 0.020 ms
perf_total_per_op_us[ GET_ROWS] = 0.007 ms
perf_total_per_op_us[ DIAG_MASK_INF] = 0.025 ms
perf_total_per_op_us[ SOFT_MAX] = 0.185 ms
perf_total_per_op_us[ ROPE] = 2.667 ms
perf_total_per_op_us[ CONV_1D_1S] = 0.000 ms
perf_total_per_op_us[ CONV_1D_2S] = 0.000 ms
perf_total_per_op_us[ FLASH_ATTN] = 0.000 ms
perf_total_per_op_us[ FLASH_FF] = 0.000 ms
- Generate graph plots
diff --git a/llama.cpp b/llama.cpp
index 3413288..7578bfa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2311,7 +2311,7 @@ static struct ggml_cgraph * llm_build_llama(
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- for (int il = 0; il < n_layer; ++il) {
+ for (int il = 0; il < 1; ++il) {
ggml_format_name(inpL, "layer_inp_%d", il);
offload_func_t offload_func = llama_nop;
@@ -2993,9 +2993,10 @@ static bool llama_eval_internal(
#endif
// plot the computation graph in dot format (for debugging purposes)
- //if (n_past%100 == 0) {
- // ggml_graph_dump_dot(gf, NULL, "llama.dot");
- //}
+ //if (N == 7) {
+ if (n_past%45 == 0) {
+ ggml_graph_dump_dot(gf, NULL, "llama.dot");
+ }
// extract logits
{
Note: n_past
is now replaced with batch.pos[]
- LLaMAv2 7B,
n_past == 45
,n_batch == 1
- LLaMAv2 7B,
n_past == 0
,n_batch == 7
- LLaMAv2 7B,
n_past == 4
,n_batch == 3
Users Guide
Useful information for users that doesn't fit into the README.
Technical Details
This is information useful for maintainers and developers that does not fit into code comments.
Github Actions Main Branch Status
Click on a badge to jump to workflow. This is here as a useful general view of all the actions so that we may notice quicker if main branch automation is broken and where.