# Benchmark
#
# Builds the llama.cpp server with CUDA, runs the k6-based server benchmark
# from examples/server/bench against it, then publishes the results as a
# build artifact, a commit status and (for one matrix entry) a PR comment.
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  # NOTE(security): pull_request_target runs in the context of the base
  # repository (token and secrets available), while the Clone step below checks
  # out the pull request head SHA. Keep the secrets/permissions reachable from
  # this job minimal and review any change that widens them.
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

# One run per workflow/ref/requested SHA; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          # Only this combination posts a PR comment (see the "Comment PR" step).
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    # Run for a manual dispatch targeting this GPU series, for scheduled runs,
    # for pull requests, and for anything on master.
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1

          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          # Block until Prometheus accepts connections on its port.
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
              --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        # Branch name, commit and duration can come from a pull request or a
        # manual dispatch, so they are handed to the shell as environment
        # variables instead of being expanded into the script text.
        env:
          HEAD_BRANCH: ${{ github.head_ref || github.ref_name }}
          HEAD_COMMIT: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }}
          BENCH_DURATION: ${{ github.event.inputs.duration || env.DURATION }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch "$HEAD_BRANCH" \
              --commit "$HEAD_COMMIT" \
              --scenario script.js \
              --duration "$BENCH_DURATION" \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          # Expose the benchmark results to the following steps.
          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        # Each .mermaid file is multi-line, so it is written to GITHUB_ENV with
        # the NAME<<DELIMITER ... DELIMITER form.
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        # Tolerate failure: the imgur upload above is itself allowed to fail.
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      # NOTE(review): the HTML tags in this message (<details>, <summary>,
      # <img>) were missing from the reviewed copy of this file and have been
      # reconstructed from the surviving labels and the IMAGE_* / mermaid
      # variables set above. Confirm against a rendered PR comment.
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            <details>

            <summary>Details</summary>

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </details>

            </details>