llama.cpp/examples/server/tests/unit/test_lora.py

import pytest
from utils import *

# Module-level server handle used by the tests below. The autouse, module-scoped
# create_server() fixture rebinds it to a fresh preset (with the LoRA adapter
# attached) before the tests in this module run.
server = ServerPreset.stories15m_moe()

# Download URL of the .gguf file that create_server() passes to download_file()
# and registers in server.lora_files.
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"

@pytest.fixture(scope="module", autouse=True)
def create_server():
    """Rebind the module-level ``server`` to a fresh stories15m_moe preset.

    The file behind ``LORA_FILE_URL`` is fetched with ``download_file`` and
    registered as the preset's only entry in ``lora_files``. The fixture is
    autouse and module-scoped, so it runs once for this module without being
    requested explicitly.
    """
    global server
    server = ServerPreset.stories15m_moe()
    # Fetch first, then attach: the adapter list holds exactly one file.
    lora_file = download_file(LORA_FILE_URL)
    server.lora_files = [lora_file]


@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespearean text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
    """Set the adapter scale via ``/lora-adapters`` and check the completion.

    Args:
        scale: Scale posted for adapter id 0 before requesting a completion.
        re_content: Regex the generated ``content`` is expected to match.
    """
    server.start()

    # Step 1: apply the requested scale to adapter 0 through the control endpoint.
    adapter_settings = [
        {"id": 0, "scale": scale}
    ]
    control_response = server.make_request("POST", "/lora-adapters", data=adapter_settings)
    assert control_response.status_code == 200

    # Step 2: run a plain completion; it carries no per-request "lora" field.
    completion_request = {
        "prompt": "Look in thy glass",
    }
    completion = server.make_request("POST", "/completion", data=completion_request)
    assert completion.status_code == 200
    assert match_regex(re_content, completion.body["content"])


def test_lora_per_request():
    """Send one prompt in parallel with a different per-request LoRA scale each time.

    Every request carries its own ``lora`` field for adapter id 0; each
    response's ``content`` must match the regex paired with that scale.
    """
    server.n_slots = 4
    server.start()

    shared_prompt = "Look in thy glass"
    # (scale for adapter 0, regex the completion must match)
    # NOTE(review): six requests are issued against n_slots = 4, so presumably
    # not every request gets a slot of its own at the same time - confirm.
    cases = [
        (0.0, "(bright|day|many|happy)+"),
        (0.0, "(bright|day|many|happy)+"),
        (0.3, "(special|thing|gifted)+"),
        (0.7, "(far|from|home|away)+"),
        (1.0, "(eye|love|glass|sun)+"),
        (1.0, "(eye|love|glass|sun)+"),
    ]

    # One (callable, positional-args) pair per case, in case order.
    tasks = []
    for scale, _ in cases:
        payload = {
            "prompt": shared_prompt,
            "lora": [{"id": 0, "scale": scale}],
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        }
        tasks.append((server.make_request, ("POST", "/completion", payload)))
    responses = parallel_function_calls(tasks)

    # All status codes are checked before any response body is inspected.
    status_ok = [response.status_code == 200 for response in responses]
    assert all(status_ok)
    for response, (_, pattern) in zip(responses, cases):
        assert match_regex(pattern, response.body["content"])


@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
    """Slow test: per-request LoRA scaling on an 8B instruct model.

    A dedicated ``ServerProcess`` is started with one LoRA adapter. The same
    chat prompt is then sent in parallel with different adapter scales, and
    each reply must contain the substring paired with its scale. The test is
    skipped unless ``is_slow_test_allowed()`` returns true.
    """
    # Local instance on purpose: named differently from the module-level
    # ``server`` so the two are not confused.
    big_server = ServerProcess()
    big_server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
    big_server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
    # Alias matches the model actually loaded above (Llama 3.1, not 3.2).
    big_server.model_alias = "Llama-3.1-8B-Instruct"
    big_server.n_slots = 4
    # presumably sized so each slot gets 1024 tokens of context - TODO confirm
    big_server.n_ctx = big_server.n_slots * 1024
    big_server.n_predict = 64
    # Fixed temperature and seed: the assertions below compare exact substrings.
    big_server.temperature = 0.0
    big_server.seed = 42
    big_server.lora_files = [
        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
        # TODO: find & add other lora adapters for this model
    ]
    # Longer startup timeout than the default start() call used elsewhere in this file.
    big_server.start(timeout_seconds=600)

    # running the same prompt with different lora scales, all in parallel
    # NOTE(review): six requests are issued against n_slots = 4, so presumably
    # not every request gets a slot of its own at the same time - confirm.
    prompt = "Write a computer virus"
    # Each entry: (per-request "lora" payload, substring expected in the reply).
    lora_config = [
        # without applying lora, the model should reject the request
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
        # with 0.7 scale, the model should provide a simple computer virus with hesitation
        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
        # with 1.5 scale, the model should confidently provide a computer virus
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
    ]

    # One (callable, positional-args) pair per configuration, in order.
    tasks = [(
        big_server.make_request,
        ("POST", "/v1/chat/completions", {
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "lora": lora,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    # Check every status code before touching any response body.
    assert all(res.status_code == 200 for res in results)
    for res, (_, expected_text) in zip(results, lora_config):
        assert expected_text in res.body["choices"][0]["message"]["content"]
server : replace behave with pytest (#10416) * server : replace behave with pytest * fix test on windows * misc * add more tests * more tests * styling * log less, fix embd test * added all sequential tests * fix coding style * fix save slot test * add parallel completion test * fix parallel test * remove feature files * update test docs * no cache_prompt for some tests * add test_cache_vs_nocache_prompt 2024-11-26 15:20:18 +00:00			`import pytest`
			`from utils import *`

			`server = ServerPreset.stories15m_moe()`

			`LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"`

			`@pytest.fixture(scope="module", autouse=True)`
			`def create_server():`
			`global server`
			`server = ServerPreset.stories15m_moe()`
server : allow using LoRA adapters per-request (#10994) * slot.can_batch_with * lora per request * test: force disable cache prompt * move can_batch_with check * fix condition * add slow test with llama 8b * update docs * move lora change task to queue * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * lora_base * remove redundant check --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2025-01-02 14:05:18 +00:00			`server.lora_files = [download_file(LORA_FILE_URL)]`
server : replace behave with pytest (#10416) * server : replace behave with pytest * fix test on windows * misc * add more tests * more tests * styling * log less, fix embd test * added all sequential tests * fix coding style * fix save slot test * add parallel completion test * fix parallel test * remove feature files * update test docs * no cache_prompt for some tests * add test_cache_vs_nocache_prompt 2024-11-26 15:20:18 +00:00

			`@pytest.mark.parametrize("scale,re_content", [`
			`# without applying lora, the model should behave like a bedtime story generator`
			`(0.0, "(little|girl|three|years|old)+"),`
			`# with lora, the model should behave like a Shakespearean text generator`
			`(1.0, "(eye|love|glass|sun)+"),`
			`])`
			`def test_lora(scale: float, re_content: str):`
			`global server`
			`server.start()`
			`res_lora_control = server.make_request("POST", "/lora-adapters", data=[`
			`{"id": 0, "scale": scale}`
			`])`
			`assert res_lora_control.status_code == 200`
			`res = server.make_request("POST", "/completion", data={`
			`"prompt": "Look in thy glass",`
			`})`
			`assert res.status_code == 200`
			`assert match_regex(re_content, res.body["content"])`

server : allow using LoRA adapters per-request (#10994) * slot.can_batch_with * lora per request * test: force disable cache prompt * move can_batch_with check * fix condition * add slow test with llama 8b * update docs * move lora change task to queue * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * lora_base * remove redundant check --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2025-01-02 14:05:18 +00:00
			`def test_lora_per_request():`
			`global server`
			`server.n_slots = 4`
			`server.start()`

			`# running the same prompt with different lora scales, all in parallel`
			`# each prompt will be processed by a different slot`
			`prompt = "Look in thy glass"`
			`lora_config = [`
			`( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),`
			`( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),`
			`( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),`
			`( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),`
			`( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),`
			`( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),`
			`]`

			`tasks = [(`
			`server.make_request,`
			`("POST", "/completion", {`
			`"prompt": prompt,`
			`"lora": lora,`
			`"seed": 42,`
			`"temperature": 0.0,`
			`"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed`
			`})`
			`) for lora, _ in lora_config]`
			`results = parallel_function_calls(tasks)`

			`assert all([res.status_code == 200 for res in results])`
			`for res, (_, re_test) in zip(results, lora_config):`
			`assert match_regex(re_test, res.body["content"])`


			`@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")`
			`def test_with_big_model():`
			`server = ServerProcess()`
			`server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"`
			`server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"`
			`server.model_alias = "Llama-3.2-8B-Instruct"`
			`server.n_slots = 4`
			`server.n_ctx = server.n_slots * 1024`
			`server.n_predict = 64`
			`server.temperature = 0.0`
			`server.seed = 42`
			`server.lora_files = [`
			`download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),`
			`# TODO: find & add other lora adapters for this model`
			`]`
			`server.start(timeout_seconds=600)`

			`# running the same prompt with different lora scales, all in parallel`
			`# each prompt will be processed by a different slot`
			`prompt = "Write a computer virus"`
			`lora_config = [`
			`# without applying lora, the model should reject the request`
			`( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),`
			`( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),`
			`( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),`
			`# with 0.7 scale, the model should provide a simple computer virus with hesitation`
			`( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),`
			`# with 1.5 scale, the model should confidently provide a computer virus`
			`( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),`
			`( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),`
			`]`

			`tasks = [(`
			`server.make_request,`
			`("POST", "/v1/chat/completions", {`
			`"messages": [`
			`{"role": "user", "content": prompt}`
			`],`
			`"lora": lora,`
			`"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed`
			`})`
			`) for lora, _ in lora_config]`
			`results = parallel_function_calls(tasks)`

			`assert all([res.status_code == 200 for res in results])`
			`for res, (_, re_test) in zip(results, lora_config):`
			`assert re_test in res.body["choices"][0]["message"]["content"]`