Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2024-12-25 02:44:36 +00:00
🚀 Dockerize llamacpp (#132)
* feat: dockerize llamacpp
* feat: split build & runtime stages
* split dockerfile into main & tools
* add quantize into tool docker image
* Update .devops/tools.sh
* add docker action pipeline
* change CI to publish at github docker registry
* fix name: runs-on macOS-latest is macos-latest (lowercase)
* include docker versioned images
* fix github action docker
* fix docker.yml
* feat: include all-in-one command tool & update readme.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 904d2a8d6a
commit 2af23d3043
17  .devops/full.Dockerfile  Normal file
@@ -0,0 +1,17 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

RUN pip install --upgrade pip setuptools wheel \
    && pip install torch torchvision torchaudio sentencepiece numpy

WORKDIR /app

COPY . .

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
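As a quick sanity check, this image can be built and exercised locally; a hedged sketch (the local tag `llamacpp-full` is an arbitrary name, not part of the commit):

```bash
# Build the all-in-one image from the repository root.
docker build -f .devops/full.Dockerfile -t llamacpp-full .

# The entrypoint is tools.sh; an unrecognized command falls through
# to the help branch, which prints the available commands.
docker run --rm llamacpp-full --help
```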
18  .devops/main.Dockerfile  Normal file
@@ -0,0 +1,18 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
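The split into `build` and `runtime` stages keeps the compiler toolchain out of the shipped image: only the `/main` binary is copied onto a fresh Ubuntu base. A hedged sketch of verifying that locally (the tag `llamacpp-light` and the model path are illustrative):

```bash
# Build the runtime-only image; the build stage (with build-essential)
# is discarded from the final layers.
docker build -f .devops/main.Dockerfile -t llamacpp-light .

# /main is the entrypoint, so inference arguments are passed directly
# (the model must already exist on the mounted volume).
docker run --rm -v /llama/models:/models llamacpp-light \
    -m /models/7B/ggml-model-q4_0.bin -p "Hello" -n 64
```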
46  .devops/tools.sh  Executable file
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

# Join the remaining arguments into a single string
arg2="$@"

if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
    python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
    python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    echo "Downloading model..."
    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "      ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
    echo "      ex: \"/models/\" 7B"
    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "      ex: \"/models/\" 7B"
fi
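For reference, each subcommand maps one-to-one onto a container invocation; a hedged sketch using the full image and the example arguments from the script's own help text:

```bash
# Download the original 7B weights into the mounted folder.
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --download "/models/" 7B

# Convert the PyTorch checkpoint in /models/7B to ggml (f16).
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --convert "/models/7B/" 1

# Quantize the f16 model to 4-bit q4_0 (mode 2).
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --quantize \
    "/models/7B/ggml-model-f16.bin" "/models/7B/ggml-model-q4_0.bin" 2
```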
24  .dockerignore  Normal file
@@ -0,0 +1,24 @@
*.o
*.a
.cache/
.vs/
.vscode/
.DS_Store

build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

models/*

/main
/quantize

arm_neon.h
compile_commands.json
Dockerfile
2  .github/workflows/build.yml  vendored
@@ -19,7 +19,7 @@ jobs:
       make

   macOS-latest:
-    runs-on: macOS-latest
+    runs-on: macos-latest

     steps:
       - name: Clone
61  .github/workflows/docker.yml  vendored  Normal file
@@ -0,0 +1,61 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
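Each matrix entry thus yields two tags: a moving tag updated on every push to master, and a versioned tag pinned to the commit SHA. A hedged sketch of consuming them (the SHA suffix below is abbreviated for illustration; the published tag uses the full `github.sha`):

```bash
# Moving tag: follows the latest push to master.
docker pull ghcr.io/ggerganov/llama.cpp:light

# SHA-pinned tag: reproducible, never re-pointed
# (abbreviated here; real tags carry the full 40-character SHA).
docker pull ghcr.io/ggerganov/llama.cpp:light-2af23d3043
```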
32  README.md
@@ -32,6 +32,7 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker

 ---
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

### Docker

#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (e.g. I'm using /llama/models)

#### Images
We have two Docker images available for this project:

1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.

#### Usage

The easiest way to download the models, convert them to ggml, and optimize them is with the --all-in-one command, which is included in the full Docker image.

```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```

Once it completes, you are ready to play!

```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
```

or with the light image:

```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
```

## Limitations
convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
+import os
 import sys
 import json
 import struct
@@ -64,6 +64,10 @@ if len(sys.argv) > 2:
     sys.exit(1)
 fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"

+if os.path.exists(fname_out):
+    print(f"Skip conversion, it already exists: {fname_out}")
+    sys.exit(0)
+
 with open(fname_hparams, "r") as f:
     hparams = json.load(f)
66  download-pth.py  Normal file
@@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests

if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)

modelsDir = sys.argv[1]
model = sys.argv[2]

num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)

print(f"Downloading model {model}")

files = ["checklist.chk", "params.json"]

for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")

resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)

for file in files:
    dest_path = os.path.join(resolved_path, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))

files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
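The downloader is likewise resumable at file granularity, since files already on disk are skipped. For reference, a hedged example of invoking it directly (outside Docker), per its own usage string:

```bash
# Fetch the 7B checkpoint shards plus tokenizer files into /llama/models;
# re-running skips anything already downloaded.
python3 download-pth.py /llama/models 7B
```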