Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-26 11:24:35 +00:00)

Compare commits: 795ac0975b...3229228b8e (15 commits)
Commits in this range (SHA1):
3229228b8e
32d6ee6385
14b699ecde
485dc01214
86bf31cfe6
b92a14a841
6f0c9e034b
dab76c92cc
7024d59e6a
7c0e285858
7ae33a616f
ebdee9478c
14f64dab74
f70b5148e1
de1bb5a4ad
.devops/cpu.Dockerfile (new file, 81 lines)
@@ -0,0 +1,81 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
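Each of the new Dockerfiles builds three runtime stages (full, light, server) from one file, so a local build selects a stage with --target. A minimal usage sketch; the image names below are placeholders, not tags published by the CI:

    # build only the server stage of the CPU image
    docker build -f .devops/cpu.Dockerfile --target server -t llama-cpp-cpu-server .
    # run it with a model directory mounted; llama-server listens on port 8080 inside the image
    docker run -v "$PWD/models":/models -p 8080:8080 llama-cpp-cpu-server -m /models/model.gguf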
.devops/cuda.Dockerfile (new file, 94 lines)
@@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
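The CUDA variant keeps the CUDA_DOCKER_ARCH build argument, so the image can be narrowed to one compute architecture instead of the default fat build. A sketch assuming the NVIDIA container toolkit is installed on the host; the architecture value and image names are examples only:

    # build the CLI-only stage for a single CUDA architecture (8.6, Ampere consumer GPUs)
    docker build -f .devops/cuda.Dockerfile --build-arg CUDA_DOCKER_ARCH=86 --target light -t llama-cpp-light-cuda .
    # GPU access at runtime goes through the NVIDIA runtime
    docker run --gpus all -v "$PWD/models":/models llama-cpp-light-cuda -m /models/model.gguf -p "Hello"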
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc) && \
    cp build/bin/* .

ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc) && \
    cp build/bin/* .

ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -1,50 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build -j $(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib/ \;

FROM ubuntu:$UBUNTU_VERSION as runtime

WORKDIR /app

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt /app/requirements.txt
COPY requirements /app/requirements
COPY .devops/tools.sh /app/tools.sh

RUN pip install --upgrade pip setuptools wheel && \
    pip install -r /app/requirements.txt

COPY --from=build /app/build/bin/ /app/
COPY --from=build /app/lib/ /app/
COPY --from=build /app/convert_hf_to_gguf.py /app/
COPY --from=build /app/gguf-py /app/gguf-py

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/tools.sh"]
.devops/intel.Dockerfile (new file, 91 lines)
@@ -0,0 +1,91 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    echo "GGML_SYCL_F16 is set" \
    && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

### Full
FROM base AS full

COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
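GGML_SYCL_F16 stays a plain build argument in the SYCL file, so FP16 support is opted into at build time. An illustrative command; the tag is a placeholder, and running the result on an Intel GPU typically also needs the render device passed through (for example --device /dev/dri), which is a host-setup assumption rather than something this Dockerfile configures:

    # build the full SYCL image with FP16 enabled
    docker build -f .devops/intel.Dockerfile --build-arg GGML_SYCL_F16=ON --target full -t llama-cpp-full-intel .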
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential git cmake

WORKDIR /app

COPY . .

# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-cli -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /

ENTRYPOINT [ "/llama-cli" ]
@@ -1,28 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    echo "GGML_SYCL_F16 is set" && \
    export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential git cmake

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-cli -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /llama-cli

ENTRYPOINT [ "/llama-cli" ]
@@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make -j$(nproc) llama-cli

ENTRYPOINT [ "/app/llama-cli" ]
@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]
@@ -1,29 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build -j $(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib/ \;

FROM ubuntu:$UBUNTU_VERSION AS runtime

WORKDIR /app

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/build/bin/llama-cli /app/
COPY --from=build /app/lib/ /app/

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/app/llama-cli" ]
@@ -1,43 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-server -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]
@@ -1,34 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    echo "GGML_SYCL_F16 is set" && \
    export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]
@@ -1,43 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-server -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]
@@ -1,54 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

RUN make -j$(nproc) llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
@@ -1,31 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build -j $(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib/ \;

FROM ubuntu:$UBUNTU_VERSION AS runtime

WORKDIR /app

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/build/bin/llama-server /app/
COPY --from=build /app/lib/ /app/

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
.devops/musa.Dockerfile (new file, 108 lines)
@@ -0,0 +1,108 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y \
    build-essential \
    cmake \
    python3 \
    python3-pip \
    git \
    libcurl4-openssl-dev \
    libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
.devops/rocm.Dockerfile (new file, 113 lines)
@@ -0,0 +1,113 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100

# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1

WORKDIR /app

COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
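Unlike the old fat-build list, ROCM_DOCKER_ARCH now defaults to gfx1100 only, so other GPUs have to override it when building. A sketch with example values; the device flags reflect the usual ROCm container setup and are an assumption about the host, not something this Dockerfile enforces:

    # build the ROCm server stage for a different architecture
    docker build -f .devops/rocm.Dockerfile --build-arg ROCM_DOCKER_ARCH=gfx90a --target server -t llama-cpp-server-rocm .
    # ROCm containers generally need /dev/kfd and /dev/dri passed through
    docker run --device /dev/kfd --device /dev/dri -v "$PWD/models":/models -p 8080:8080 llama-cpp-server-rocm -m /models/model.gguf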
.devops/vulkan.Dockerfile (new file, 88 lines)
@@ -0,0 +1,88 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app

COPY . .

RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
.github/workflows/docker.yml (vendored, 104 lines changed)
@@ -34,21 +34,14 @@ jobs:
    strategy:
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+          # Multi-stage build
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@@ -56,10 +49,10 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
@@ -79,25 +72,34 @@ jobs:

          # determine tag name postfix (build number, commit hash)
          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="b${BUILD_NUMBER}"
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
          else
            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
          fi

          # list all tags possible
-          TAGS=""
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
-
-          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
-          echo "output_tags=$TAGS" # print out for debugging
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+            TYPE=""
+          else
+            TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS" # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
+          echo "server_output_tags=$SERVERTAGS" # print out for debugging
        env:
          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.free_disk_space == true }}
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
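For illustration, with the tag logic above a master-branch run of the cuda configuration sets TYPE="-cuda" and TAG_POSTFIX="-b<build>", so the three build steps push tag pairs along these lines (owner, repository and build number are placeholders):

    ghcr.io/OWNER/REPO:full-cuda    and  ghcr.io/OWNER/REPO:full-cuda-b4400
    ghcr.io/OWNER/REPO:light-cuda   and  ghcr.io/OWNER/REPO:light-cuda-b4400
    ghcr.io/OWNER/REPO:server-cuda  and  ghcr.io/OWNER/REPO:server-cuda-b4400

The cpu configuration sets TYPE to an empty string, so it publishes the unsuffixed :full, :light and :server tags plus their -b<build> variants.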
@@ -113,13 +115,59 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Build and push Docker image (tagged + versioned)
-        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.output_tags }}
+          tags: ${{ steps.tag.outputs.full_output_tags }}
          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
@@ -529,9 +529,19 @@ class Model:
            else:
                token: str = reverse_vocab[i]
                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
                        toktypes.append(gguf.TokenType.USER_DEFINED)
                else:
@@ -575,6 +585,9 @@ class Model:
        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
            # ref: https://huggingface.co/tiiuae/falcon-7b
            res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
@@ -671,6 +684,9 @@ class Model:
        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"

        if res is None:
            logger.warning("\n")
@@ -1679,6 +1695,184 @@ class LlamaModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=True,
+                special_token_types = ['bos', 'eos', 'eom', 'eot']
+            )
+            special_vocab._set_special_token("bos", 128000)
+            special_vocab._set_special_token("eos", 128001)
+            special_vocab._set_special_token("eom", 128008)
+            special_vocab._set_special_token("eot", 128009)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+            # self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetModel")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
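As a quick illustration of the FFN sizing helpers introduced in `DeciModel`, the same arithmetic can be reproduced on its own; the multiplier and hidden size below are example values, not taken from any real config.

```python
# Standalone copy of DeciModel._find_multiple / _ffn_mult_to_intermediate_size.
def find_multiple(n: int, k: int) -> int:
    # round n up to the next multiple of k
    return n if n % k == 0 else n + k - (n % k)

def ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
    # 2/3 scaling of the multiplied width, then rounded up to a multiple of 256
    return find_multiple(int(2 * ffn_mult * n_embd / 3), 256)

print(ffn_mult_to_intermediate_size(2.625, 8192))  # 14336 (already a multiple of 256)
print(ffn_mult_to_intermediate_size(1.0, 4096))    # int(2730.66) = 2730, rounded up to 2816
```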
@@ -72,6 +72,7 @@ models = [
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
     {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@@ -105,6 +106,7 @@ models = [
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
+    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
 ]
 
 
@@ -134,6 +134,12 @@ This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA t
   cmake --build build --config Release
   ```
 
+- Using `CMake` with path :
+
+  ```bash
+  rm -rf build & /usr/local/bin/cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
+  /usr/local/bin/cmake --build build --config Release -j
+  ```
 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
 
 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
@@ -12,6 +12,10 @@
 #include "ggml-vulkan.h"
 #endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
 
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
@@ -19,6 +19,8 @@ Options:
       Context size (default: 2048)
   -n, --ngl <value>
       Number of GPU layers (default: 0)
+  --temp <value>
+      Temperature (default: 0.8)
   -v, --verbose, --log-verbose
       Set verbosity level to infinity (i.e. log all messages, useful for debugging)
   -h, --help
@@ -55,29 +55,51 @@ static int printe(const char * fmt, ...) {
 class Opt {
   public:
     int init(int argc, const char ** argv) {
+        ctx_params = llama_context_default_params();
+        model_params = llama_model_default_params();
+        context_size_default = ctx_params.n_batch;
+        ngl_default = model_params.n_gpu_layers;
+        common_params_sampling sampling;
+        temperature_default = sampling.temp;
+
+        if (argc < 2) {
+            printe("Error: No arguments provided.\n");
+            print_help();
+            return 1;
+        }
+
         // Parse arguments
         if (parse(argc, argv)) {
             printe("Error: Failed to parse arguments.\n");
-            help();
+            print_help();
             return 1;
         }
 
         // If help is requested, show help and exit
-        if (help_) {
-            help();
+        if (help) {
+            print_help();
             return 2;
         }
 
+        ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
+        model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
+        temperature = temperature >= 0 ? temperature : temperature_default;
+
         return 0; // Success
     }
 
+    llama_context_params ctx_params;
+    llama_model_params model_params;
     std::string model_;
-    std::string user_;
-    int context_size_ = -1, ngl_ = -1;
-    bool verbose_ = false;
+    std::string user;
+    int context_size = -1, ngl = -1;
+    float temperature = -1;
+    bool verbose = false;
 
   private:
-    bool help_ = false;
+    int context_size_default = -1, ngl_default = -1;
+    float temperature_default = -1;
+    bool help = false;
 
     bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
         return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
@@ -89,6 +111,17 @@ class Opt {
         }
 
         option_value = std::atoi(argv[++i]);
+
+        return 0;
+    }
+
+    int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atof(argv[++i]);
 
         return 0;
     }
 
@@ -96,18 +129,22 @@ class Opt {
         bool options_parsing = true;
         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
             if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
-                if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
+                if (handle_option_with_value(argc, argv, i, context_size) == 1) {
                     return 1;
                 }
             } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
-                if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
+                if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+                if (handle_option_with_value(argc, argv, i, temperature) == 1) {
                     return 1;
                 }
             } else if (options_parsing &&
                        (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
-                verbose_ = true;
+                verbose = true;
             } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
-                help_ = true;
+                help = true;
                 return 0;
             } else if (options_parsing && strcmp(argv[i], "--") == 0) {
                 options_parsing = false;
@@ -120,16 +157,16 @@ class Opt {
                 model_ = argv[i];
             } else if (positional_args_i == 1) {
                 ++positional_args_i;
-                user_ = argv[i];
+                user = argv[i];
             } else {
-                user_ += " " + std::string(argv[i]);
+                user += " " + std::string(argv[i]);
             }
         }
 
         return 0;
     }
 
-    void help() const {
+    void print_help() const {
         printf(
             "Description:\n"
             " Runs a llm\n"
@@ -142,6 +179,8 @@ class Opt {
             " Context size (default: %d)\n"
             " -n, --ngl <value>\n"
             " Number of GPU layers (default: %d)\n"
+            " --temp <value>\n"
+            " Temperature (default: %.1f)\n"
             " -v, --verbose, --log-verbose\n"
             " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
            " -h, --help\n"
@@ -170,7 +209,7 @@ class Opt {
             " llama-run file://some-file3.gguf\n"
             " llama-run --ngl 999 some-file4.gguf\n"
             " llama-run --ngl 999 some-file5.gguf Hello World\n",
-            llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
+            context_size_default, ngl_default, temperature_default);
     }
 };
 
@@ -495,12 +534,12 @@ class LlamaData {
             return 1;
         }
 
-        context = initialize_context(model, opt.context_size_);
+        context = initialize_context(model, opt);
         if (!context) {
             return 1;
         }
 
-        sampler = initialize_sampler();
+        sampler = initialize_sampler(opt);
         return 0;
     }
 
@@ -619,14 +658,12 @@ class LlamaData {
     // Initializes the model and returns a unique pointer to it
     llama_model_ptr initialize_model(Opt & opt) {
         ggml_backend_load_all();
-        llama_model_params model_params = llama_model_default_params();
-        model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
         resolve_model(opt.model_);
         printe(
             "\r%*s"
             "\rLoading model",
             get_terminal_width(), " ");
-        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
+        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }
@@ -636,10 +673,8 @@ class LlamaData {
     }
 
     // Initializes the context with the specified parameters
-    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
-        llama_context_params ctx_params = llama_context_default_params();
-        ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
-        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+    llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
+        llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
         if (!context) {
             printe("%s: error: failed to create the llama_context\n", __func__);
         }
@@ -648,10 +683,10 @@ class LlamaData {
     }
 
     // Initializes and configures the sampler
-    llama_sampler_ptr initialize_sampler() {
+    llama_sampler_ptr initialize_sampler(const Opt & opt) {
         llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
         llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
-        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
         llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
         return sampler;
@@ -798,9 +833,9 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const
 }
 
 // Helper function to handle user input
-static int handle_user_input(std::string & user_input, const std::string & user_) {
-    if (!user_.empty()) {
-        user_input = user_;
+static int handle_user_input(std::string & user_input, const std::string & user) {
+    if (!user.empty()) {
+        user_input = user;
         return 0; // No need for interactive input
     }
 
@@ -832,17 +867,17 @@ static bool is_stdout_a_terminal() {
 }
 
 // Function to tokenize the prompt
-static int chat_loop(LlamaData & llama_data, const std::string & user_) {
+static int chat_loop(LlamaData & llama_data, const std::string & user) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
     static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
-        while (handle_user_input(user_input, user_)) {
+        while (handle_user_input(user_input, user)) {
         }
 
-        add_message("user", user_.empty() ? user_input : user_, llama_data);
+        add_message("user", user.empty() ? user_input : user, llama_data);
         int new_len;
         if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
             return 1;
@@ -854,7 +889,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
             return 1;
         }
 
-        if (!user_.empty()) {
+        if (!user.empty()) {
             break;
         }
 
@@ -869,7 +904,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
 
 static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
     const Opt * opt = static_cast<Opt *>(p);
-    if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
+    if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
         printe("%s", text);
     }
 }
@@ -890,11 +925,11 @@ int main(int argc, const char ** argv) {
     }
 
     if (!is_stdin_a_terminal()) {
-        if (!opt.user_.empty()) {
-            opt.user_ += "\n\n";
+        if (!opt.user.empty()) {
+            opt.user += "\n\n";
         }
 
-        opt.user_ += read_pipe_data();
+        opt.user += read_pipe_data();
     }
 
     llama_log_set(log_callback, &opt);
@@ -903,7 +938,7 @@ int main(int argc, const char ** argv) {
         return 1;
     }
 
-    if (chat_loop(llama_data, opt.user_)) {
+    if (chat_loop(llama_data, opt.user)) {
         return 1;
     }
 
@@ -724,7 +724,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
   },
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```
 
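For reference, the new field can be read back from a running server; the endpoint path and port below are assumptions based on the surrounding documentation, not part of the diff itself.

```python
# Hedged sketch: fetch server properties and print the new build_info field.
import requests

props = requests.get("http://localhost:8080/props").json()
print(props.get("build_info"))  # expected form: "b<build number>-<build commit hash>"
```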
@@ -595,10 +595,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
 
         json res = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -632,11 +633,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
 
         json ret = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -761,11 +763,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
 
         json ret = json {
             {"choices", choices},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"}
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"}
         };
 
         if (timings.prompt_n >= 0) {
@@ -3476,6 +3479,7 @@ int main(int argc, char ** argv) {
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
+            { "build_info", build_info },
         };
 
         res_ok(res, data);
@@ -3697,7 +3701,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     })
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     last_cmpl_id = None
     for data in res:
         choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
@@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
         seed=42,
         temperature=0.8,
     )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].message.content is not None
     assert match_regex("(Suddenly)+", res.choices[0].message.content)
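The same `system_fingerprint` check the tests perform can be done from a plain OpenAI-compatible client; the base URL, placeholder model name, and API key below are assumptions for illustration only.

```python
# Hedged sketch: read system_fingerprint from an OpenAI-compatible llama-server endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
res = client.chat.completions.create(
    model="gpt-3.5-turbo",  # placeholder; the server answers for whatever model it loaded
    messages=[{"role": "user", "content": "Say hello"}],
    max_tokens=8,
)
print(res.system_fingerprint)  # expected to start with "b", e.g. "b<build>-<commit>"
```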
|
@ -56,6 +56,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||||
|
|
||||||
//
|
//
|
||||||
// tokenizer and input processing utils
|
// tokenizer and input processing utils
|
||||||
//
|
//
|
||||||
|
@@ -402,12 +402,16 @@ static std::string get_executable_path() {
         base_path = base_path.substr(0, last_slash);
     }
     return base_path + "/";
-#elif defined(__linux__)
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+# if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+# elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+# endif
         if (len == -1) {
             break;
         }
@@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -3205,8 +3205,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         GGML_ABORT("fatal error");
     }
     // Check if src is pinned memory
-    vk_buffer buf;
-    size_t buf_offset;
+    vk_buffer buf = nullptr;
+    size_t buf_offset = 0;
     ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
 
     const uint64_t ne0 = tensor->ne[0];
@@ -3269,7 +3269,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         VkBufferCopy buf_copy{ 0, offset, copy_size };
 
         ggml_vk_sync_buffers(subctx);
-        vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
+        vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
         for (uint64_t i3 = 0; i3 < ne3; i3++) {
             for (uint64_t i2 = 0; i2 < ne2; i2++) {
@@ -3302,7 +3302,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
     }
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
-    size_t buf_offset;
+    size_t buf_offset = 0;
     ggml_vk_host_get(dst->device, src, buf, buf_offset);
 
     if (buf != nullptr) {
@@ -3344,7 +3344,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         copy_size};
 
     ggml_vk_sync_buffers(subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     if (width == spitch) {
         deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
@@ -3400,7 +3400,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
 
     // Check if dst is pinned memory
     vk_buffer buf = nullptr;
-    size_t buf_offset;
+    size_t buf_offset = 0;
    ggml_vk_host_get(src->device, dst, buf, buf_offset);
 
     std::vector<vk::BufferCopy> slices(1);
@@ -3480,7 +3480,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
 
     VkBufferCopy bc{ src_offset, dst_offset, size };
 
-    vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
+    vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
 }
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
@@ -3732,9 +3732,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src0_uma = false;
@@ -3934,9 +3934,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src0_uma = false;
@@ -4112,7 +4112,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src1_uma = false;
@@ -4300,11 +4300,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
     ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
-    vk_buffer d_ids;
+    vk_buffer d_ids = nullptr;
     size_t ids_buf_offset = 0;
 
     bool src0_uma = false;
@@ -4505,11 +4505,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
     ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
-    vk_buffer d_ids;
+    vk_buffer d_ids = nullptr;
     size_t ids_buf_offset = 0;
 
     bool src0_uma = false;
@@ -4768,8 +4768,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
     ggml_vk_sync_buffers(subctx);
 
-    vk_buffer d_Q, d_K, d_V, d_D, d_M;
-    uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset;
+    vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
+    size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
 
     bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
 
@@ -5474,8 +5474,8 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
 
     ggml_vk_sync_buffers(subctx);
 
-    vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State;
-    uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset;
+    vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
+    size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
     bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
 
     if (ctx->device->uma) {
@@ -221,6 +221,7 @@ class GGUFType:
 
 class MODEL_ARCH(IntEnum):
     LLAMA = auto()
+    DECI = auto()
     FALCON = auto()
     BAICHUAN = auto()
     GROK = auto()
@@ -402,6 +403,7 @@ class MODEL_TENSOR(IntEnum):
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.LLAMA: "llama",
+    MODEL_ARCH.DECI: "deci",
     MODEL_ARCH.FALCON: "falcon",
     MODEL_ARCH.BAICHUAN: "baichuan",
     MODEL_ARCH.GROK: "grok",
@@ -602,6 +604,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DECI: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.GROK: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
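A small hedged sketch of how the new architecture entries registered above can be inspected from the `gguf-py` package in this repository (assuming it is importable, e.g. via `pip install -e gguf-py`):

```python
# Print the architecture name and the first few tensor types registered for DECI.
import gguf

print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.DECI])   # "deci"
print(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.DECI][:4])  # first few tensors from the list above
```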
@@ -1448,6 +1470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DECI: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.BAICHUAN: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
|
@ -198,6 +198,7 @@ class TensorNameMap:
|
|||||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||||
"h.{bid}.self_attention.dense", # bloom
|
"h.{bid}.self_attention.dense", # bloom
|
||||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
|
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
|
||||||
|
"model.layers.{bid}.self_attn.linear_attn", # deci
|
||||||
"layers.{bid}.attention.wo", # llama-pth
|
"layers.{bid}.attention.wo", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||||
|
300
src/llama.cpp
300
src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
@@ -203,6 +204,7 @@ enum llm_arch {
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GROK, "grok" },
     { LLM_ARCH_GPT2, "gpt2" },
@@ -674,6 +676,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DECI,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_BAICHUAN,
         {
@@ -1673,6 +1701,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
     LLM_CHAT_TEMPLATE_GEMMA,
@@ -1691,6 +1720,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
     LLM_CHAT_TEMPLATE_GRANITE,
     LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -1705,6 +1735,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
     { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@@ -1723,6 +1754,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+    { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -5692,7 +5724,7 @@ static void llm_load_hparams(
 
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@ -5732,6 +5764,15 @@ static void llm_load_hparams(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_DECI:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 32: model.type = e_model::MODEL_7B; break;
|
||||||
|
case 80: model.type = e_model::MODEL_70B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MINICPM:
|
case LLM_ARCH_MINICPM:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
@@ -6562,7 +6603,8 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
-                tokenizer_pre == "llama-bpe") {
+                tokenizer_pre == "llama-bpe"||
+                tokenizer_pre == "falcon3") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             vocab.tokenizer_ignore_merges = true;
             vocab.tokenizer_add_bos = true;
@@ -6663,6 +6705,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "minerva-7b") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+        } else if (
+                tokenizer_pre == "megrez") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -7936,6 +7981,68 @@ static bool llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_DECI:
+            {
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (model.output == NULL) {
+                    model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = model.layers[i];
+                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+                    const int64_t n_embd_gqa   = hparams.n_embd_v_gqa(i);
+                    const int64_t n_ff         = hparams.n_ff(i);
+                    const int64_t n_head       = hparams.n_head(i);
+                    const int64_t n_head_kv    = hparams.n_head_kv(i);
+
+                    if (n_head_kv == 0 && n_head > 0) {
+                        // linear attention for DeciLMCausalModel
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    }
+                    else if (n_head_kv > 0) {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                    }
+
+                    // optional bias tensors
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                    else {
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                    // optional MLP bias
+                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
+                }
+            } break;
         case LLM_ARCH_MINICPM3:
             {
                 const int64_t n_embd_head_qk_rope = hparams.n_rot;
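Context for the hunk above: LLM_ARCH_DECI targets DeciLM-derived checkpoints such as Llama-3_1-Nemotron-51B, whose layers are not uniform, so the loader only creates the attention tensors a given layer actually has. Below is a minimal illustrative sketch of the per-layer classification implied by the n_head / n_head_kv branches above; the enum and helper name are ours, not part of the patch.

// Illustrative only: mirrors the conditions used when creating tensors in the DECI case above.
enum class deci_layer_kind {
    attention_free,   // n_head == 0: the block has no attention tensors at all
    linear_attention, // n_head > 0, n_head_kv == 0: only attn_norm and wo are created
    full_attention,   // n_head_kv > 0: wq/wk/wv/wo as in a regular llama block
};

static deci_layer_kind classify_deci_layer(int64_t n_head, int64_t n_head_kv) {
    if (n_head == 0)    return deci_layer_kind::attention_free;
    if (n_head_kv == 0) return deci_layer_kind::linear_attention;
    return deci_layer_kind::full_attention;
}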
@@ -11305,6 +11412,167 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_deci() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head    = hparams.n_head(il);
+
+            if (n_head == 0) {
+                // attention-free layer of Llama-3_1-Nemotron-51B
+                cur = inpL;
+            } else {
+                // norm
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            if (n_head > 0 && n_head_kv == 0) {
+                // "linear attention" of Llama-3_1-Nemotron-51B
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "wo", il);
+            } else if (n_head > 0) {
+                // self-attention
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            // modified to support attention-free layer of Llama-3_1-Nemotron-51B
+            struct ggml_tensor * ffn_inp = cur;
+            if (n_head > 0) {
+                ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+            }
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_baichuan() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
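As a reading aid, here is our paraphrase of what build_deci() above computes for each kind of layer (x is the layer input). This summary is derived from the branches in the new function and is not additional patch content.

// Per-layer dataflow in build_deci(), as implemented above:
//
//   attention-free layer  (n_head == 0):
//       y = x + FFN(rms_norm(x))
//
//   "linear attention"    (n_head > 0, n_head_kv == 0):
//       h = wo * rms_norm(x);    ffn_inp = h + x;    y = ffn_inp + FFN(rms_norm(ffn_inp))
//
//   full attention        (n_head_kv > 0):
//       h = attn(rms_norm(x));   ffn_inp = h + x;    y = ffn_inp + FFN(rms_norm(ffn_inp))
//
// The Granite-style scalings (f_residual_scale, f_logit_scale) are applied only when those
// hyperparameters are non-zero, matching the behaviour of build_llama().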
@@ -17419,6 +17687,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_llama();
             } break;
+        case LLM_ARCH_DECI:
+            {
+                result = llm.build_deci();
+            } break;
         case LLM_ARCH_BAICHUAN:
             {
                 result = llm.build_baichuan();
@@ -20794,6 +21066,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
@@ -22615,6 +22888,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
        }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -22661,6 +22936,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GRANITE;
     } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
         return LLM_CHAT_TEMPLATE_GIGACHAT;
+    } else if (tmpl_contains("<|role_start|>")) {
+        return LLM_CHAT_TEMPLATE_MEGREZ;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -22767,6 +23044,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
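For reference, the new Falcon 3 branch above emits one `<|role|>` line followed by the message content for every turn, and appends the assistant header when add_ass is set. With a made-up system/user exchange the rendered prompt looks like this:

<|system|>
You are a helpful assistant
<|user|>
Hello
<|assistant|>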
@@ -23010,6 +23296,16 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "assistant<|role_sep|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+        // Megrez template
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+        }
+
+        if (add_ass) {
+            ss << "<|role_start|>assistant<|role_end|>";
+        }
     } else {
         // template not supported
         return -1;
@@ -77,6 +77,8 @@ int main(void) {
         "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
         // ai-sage/GigaChat-20B-A3B-instruct
         "{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
+        // Infinigence/Megrez-3B-Instruct
+        u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct,将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -133,6 +135,8 @@ int main(void) {
         "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
         // ai-sage/GigaChat-20B-A3B-instruct
         "<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
+        // Infinigence/Megrez-3B-Instruct
+        "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
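The two test hunks above register the Megrez Jinja template and the string it is expected to render to. Below is a minimal sketch of how a custom template can be exercised through the public API; it assumes the llama_chat_apply_template signature at this revision (which still takes a model pointer that may be NULL when a template string is supplied) and uses a made-up conversation.

#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant" },
        { "user",   "Hello"                       },
    };

    // Detection is substring-based: any template containing "<|role_start|>" is treated
    // as Megrez, so a full Jinja template (as in the test above) is not required here.
    const char * tmpl = "<|role_start|>";

    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template(nullptr, tmpl, chat, 2, /*add_ass=*/true, buf.data(), buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        // Expected (one line, wrapped here): <|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|>
        //                                    <|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>
        printf("%.*s\n", n, buf.data());
    }
    return 0;
}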