mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
ci: bench: add more ftype, fix triggers and bot comment (#6466)
* ci: bench: change trigger path to not spawn on each PR
* ci: bench: add more file types for phi-2: q8_0 and f16.
  - do not show the comment by default
* ci: bench: add seed parameter in k6 script
* ci: bench: artefact name perf job
* Add iteration in the commit status, reduce again the autocomment
* ci: bench: add per slot metric in the commit status
* Fix trailing spaces
This commit is contained in:
parent
4bcd6b959c
commit
7a2c92637a
37
.github/workflows/bench.yml
vendored
37
.github/workflows/bench.yml
vendored
@ -24,10 +24,10 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
|
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||||
pull_request_target:
|
pull_request_target:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
|
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '04 2 * * *'
|
- cron: '04 2 * * *'
|
||||||
|
|
||||||
@ -42,6 +42,16 @@ jobs:
|
|||||||
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
|
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
|
||||||
N_USERS: 8
|
N_USERS: 8
|
||||||
DURATION: 10m
|
DURATION: 10m
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
model: [phi-2]
|
||||||
|
ftype: [q4_0, q8_0, f16]
|
||||||
|
include:
|
||||||
|
- model: phi-2
|
||||||
|
ftype: q4_0
|
||||||
|
pr_comment_enabled: "true"
|
||||||
|
|
||||||
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
|
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
@ -116,7 +126,7 @@ jobs:
|
|||||||
--scenario script.js \
|
--scenario script.js \
|
||||||
--duration ${{ github.event.inputs.duration || env.DURATION }} \
|
--duration ${{ github.event.inputs.duration || env.DURATION }} \
|
||||||
--hf-repo ggml-org/models \
|
--hf-repo ggml-org/models \
|
||||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
|
||||||
--model-path-prefix /models \
|
--model-path-prefix /models \
|
||||||
--parallel ${{ env.N_USERS }} \
|
--parallel ${{ env.N_USERS }} \
|
||||||
-ngl 33 \
|
-ngl 33 \
|
||||||
@ -134,7 +144,7 @@ jobs:
|
|||||||
|
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: benchmark-results
|
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
|
||||||
compression-level: 9
|
compression-level: 9
|
||||||
path: |
|
path: |
|
||||||
examples/server/bench/*.jpg
|
examples/server/bench/*.jpg
|
||||||
@ -146,7 +156,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
authToken: ${{secrets.GITHUB_TOKEN}}
|
authToken: ${{secrets.GITHUB_TOKEN}}
|
||||||
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
|
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
|
||||||
context: bench-server-baseline
|
context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
|
||||||
description: |
|
description: |
|
||||||
${{ env.BENCH_RESULTS }}
|
${{ env.BENCH_RESULTS }}
|
||||||
state: 'success'
|
state: 'success'
|
||||||
@ -203,11 +213,19 @@ jobs:
|
|||||||
- name: Comment PR
|
- name: Comment PR
|
||||||
uses: mshick/add-pr-comment@v2
|
uses: mshick/add-pr-comment@v2
|
||||||
id: comment_pr
|
id: comment_pr
|
||||||
if: ${{ github.event.pull_request != '' }}
|
if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
|
||||||
with:
|
with:
|
||||||
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
|
message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
|
||||||
message: |
|
message: |
|
||||||
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
|
<p align="center">
|
||||||
|
|
||||||
|
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
|
||||||
|
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
|
||||||
|
<summary>Expand details for performance related PR only</summary>
|
||||||
|
|
||||||
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
|
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
|
||||||
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
|
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
|
||||||
@ -215,9 +233,6 @@ jobs:
|
|||||||
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
|
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
|
||||||
- ${{ env.BENCH_GRAPH_XLABEL }}
|
- ${{ env.BENCH_GRAPH_XLABEL }}
|
||||||
|
|
||||||
<details>
|
|
||||||
|
|
||||||
<summary>Time series</summary>
|
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ import matplotlib
|
|||||||
import matplotlib.dates
|
import matplotlib.dates
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import requests
|
import requests
|
||||||
|
from statistics import mean
|
||||||
|
|
||||||
|
|
||||||
def main(args_in: list[str] | None = None) -> None:
|
def main(args_in: list[str] | None = None) -> None:
|
||||||
@ -109,6 +110,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
|
|
||||||
# Prometheus
|
# Prometheus
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
prometheus_metrics = {}
|
||||||
if is_server_listening("0.0.0.0", 9090):
|
if is_server_listening("0.0.0.0", 9090):
|
||||||
metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
|
metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
|
||||||
'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
|
'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
|
||||||
@ -127,6 +129,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||||||
values = metric_data['data']['result'][0]['values']
|
values = metric_data['data']['result'][0]['values']
|
||||||
timestamps, metric_values = zip(*values)
|
timestamps, metric_values = zip(*values)
|
||||||
metric_values = [float(value) for value in metric_values]
|
metric_values = [float(value) for value in metric_values]
|
||||||
|
prometheus_metrics[metric] = metric_values
|
||||||
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
|
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
|
||||||
plt.figure(figsize=(16, 10), dpi=80)
|
plt.figure(figsize=(16, 10), dpi=80)
|
||||||
plt.plot(timestamps_dt, metric_values, label=metric)
|
plt.plot(timestamps_dt, metric_values, label=metric)
|
||||||
@ -176,17 +179,20 @@ xychart-beta
|
|||||||
|
|
||||||
# 140 chars max for commit status description
|
# 140 chars max for commit status description
|
||||||
bench_results = {
|
bench_results = {
|
||||||
|
"i": iterations,
|
||||||
"req": {
|
"req": {
|
||||||
"p90": data['metrics']["http_req_duration"]["p(90)"],
|
"p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
|
||||||
"avg": data['metrics']["http_req_duration"]["avg"],
|
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
|
||||||
},
|
},
|
||||||
"pp": {
|
"pp": {
|
||||||
"p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
|
"p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
|
||||||
"avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
|
"avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
|
||||||
|
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
|
||||||
},
|
},
|
||||||
"tg": {
|
"tg": {
|
||||||
"p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
|
"p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
|
||||||
"avg": data['metrics']["llamacpp_tokens_second"]["avg"],
|
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
||||||
|
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
with open("results.github.env", 'a') as github_env:
|
with open("results.github.env", 'a') as github_env:
|
||||||
|
@ -87,6 +87,7 @@ export default function () {
|
|||||||
],
|
],
|
||||||
"model": model,
|
"model": model,
|
||||||
"stream": false,
|
"stream": false,
|
||||||
|
"seed": 42,
|
||||||
"max_tokens": max_tokens
|
"max_tokens": max_tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user