Merge branch 'master' into batched-bench

2024-12-27 03:44:35 +00:00 · 2023-10-11 19:18:35 +03:00 · 2023-10-11 19:18:35 +03:00 · 76e17f8d93
commit 76e17f8d93
parent 026bb1b1cd 24ba3d829e
36 changed files with 3118 additions and 454 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -276,6 +276,11 @@ jobs:
        run: |
          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
      - name: Build Swift Example
        id: make_build_swift_example
        run: |
            make swift
  windows-latest-cmake:
    runs-on: windows-latest
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@ -0,0 +1,25 @@
 name: Zig CI
 on:
  pull_request:
  push:
    branches:
      - master
 jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
      - uses: goto-bus-stop/setup-zig@v2
        with:
          version: 0.11.0
      - name: Build Summary
        run: zig build --summary all -freference-trace
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -663,6 +663,8 @@ add_library(ggml OBJECT
            ggml.h
            ggml-alloc.c
            ggml-alloc.h
            ggml-backend.c
            ggml-backend.h
            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
--- a/14
+++ b/14
@ -518,9 +518,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
-OBJS += ggml-alloc.o
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
+OBJS += ggml-alloc.o ggml-backend.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: common/common.cpp common/common.h build-info.h common/log.h
@ -623,6 +626,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@ -643,7 +651,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 run-benchmark-matmult: benchmark-matmult
 	./$@
-.PHONY: run-benchmark-matmult
+.PHONY: run-benchmark-matmult swift
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -1,10 +1,10 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 import PackageDescription
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
    .iOS(.v14),
    .watchOS(.v4),
    .tvOS(.v14)
@ -41,12 +41,13 @@ let package = Package(
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "k_quants.c",
            ] + additionalSources,
            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
                .define("GGML_USE_K_QUANTS"),
                .define("GGML_USE_ACCELERATE")
                // NOTE: NEW_LAPACK will required iOS version 16.4+
--- a/README.md
+++ b/README.md
@ -96,6 +96,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 **Bindings:**
--- a/build.zig
+++ b/build.zig
@ -36,14 +36,17 @@ const Maker = struct {
    }
    fn init(builder: *std.build.Builder) !Maker {
        // const commit_hash = @embedFile(".git/refs/heads/master");
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        const config_header = builder.addConfigHeader(
            .{ .style = .blank, .include_path = "build-info.h" },
            .{
                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = "12345", // omit newline
+                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
                .BUILD_TARGET = try target.allocDescription(builder.allocator),
            },
        );
@ -67,13 +70,21 @@ const Maker = struct {
    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);
        o.addConfigHeader(m.config_header);
        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            if (o.target.getAbi() == .msvc) {
                o.linkLibC(); // need winsdk + crt
            } else {
                // linkLibCpp already add (libc++ + libunwind + libc)
                o.linkLibCpp();
            }
        }
        o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
@ -86,8 +97,14 @@ const Maker = struct {
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-        e.linkLibC();
+
        // https://github.com/ziglang/zig/issues/15448
        if (e.target.getAbi() == .msvc) {
            e.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
@ -107,20 +124,21 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const llama = make.obj("llama", "llama.cpp");
    const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("common", "common/console.cpp");
+    const console = make.obj("console", "common/console.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/convert-bloom-hf-to-gguf.py
+++ b/convert-bloom-hf-to-gguf.py
@ -0,0 +1,238 @@
 #!/usr/bin/env python3
 # HF bloom --> gguf conversion
 from __future__ import annotations
 import argparse
 import json
 import os
 import re
 import struct
 import sys
 from pathlib import Path
 from typing import Any
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
 # Supported Models:
 #   https://huggingface.co/bigscience/bloom-1b7
 #   https://huggingface.co/bigscience/bloom-3b
 #   https://huggingface.co/bigscience/bloom-7b1
 #   https://huggingface.co/Langboat/bloom-1b4-zh
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()
 args = parse_args()
 dir_model = args.model
 ftype = args.ftype
 if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 if args.outfile is not None:
    fname_out = args.outfile
 else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
 print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 ARCH=gguf.MODEL_ARCH.BLOOM
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
 block_count = hparams["n_layer"]
 gguf_writer.add_name("Bloom")
 n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
 n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
 gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
 gguf_writer.add_embedding_length(n_embed)
 gguf_writer.add_feed_forward_length(4 * n_embed)
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(n_head)
 gguf_writer.add_head_count_kv(n_head)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 # TOKENIZATION
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 print("gguf: get gpt2 tokenizer vocab")
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 # params for qkv transform
 n_head_kv = hparams.get("n_head_kv", n_head)
 head_dim = n_embed // n_head
 # tensor info
 print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
 for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")
    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False
    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)
        old_dtype = data.dtype
        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)
        data = data.squeeze().numpy()
        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        data_dtype = data.dtype
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)
        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)
        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
 if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
 gguf_writer.close()
 print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@ -0,0 +1,216 @@
 #!/usr/bin/env python3
 # HF mpt--> gguf conversion
 from __future__ import annotations
 import argparse
 import json
 import os
 import struct
 import sys
 from pathlib import Path
 from typing import Any
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()
 args = parse_args()
 dir_model = args.model
 ftype = args.ftype
 if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 if args.outfile is not None:
    fname_out = args.outfile
 else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
 print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 if hparams["architectures"][0] != "MPTForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit()
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 ARCH=gguf.MODEL_ARCH.MPT
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
 block_count = hparams["n_layers"]
 gguf_writer.add_name(dir_model.name)
 gguf_writer.add_context_length(hparams["max_seq_len"])
 gguf_writer.add_embedding_length(hparams["d_model"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
 gguf_writer.add_head_count(hparams["n_heads"])
 gguf_writer.add_layer_norm_eps(1e-05)
 if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
 gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
 # TOKENIZATION
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 print("gguf: get gpt2 tokenizer vocab")
 # MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
 # there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
 # accomodate some "reserved" tokens; this is causing problems down the line in
 # llama.cpp, so we pad the vocab with dummy tokens:
 vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # tensor info
 print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
 for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
    for name in model_part.keys():
        data = model_part[name]
        old_dtype = data.dtype
        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)
        data = data.squeeze().numpy()
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            continue # for the sake of compatibility with some old published models, don't quit
            sys.exit()
        n_dims = len(data.shape)
        data_dtype = data.dtype
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)
        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)
        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
        # note: MPT output is tied to (same as) wte in original model;
        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
        if new_name == "token_embd.weight":
            gguf_writer.add_tensor("output.weight", data)
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
 if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
 gguf_writer.close()
 print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@ -17,33 +17,6 @@ if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
 import gguf
 def bytes_to_unicode():
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
@ -153,53 +126,25 @@ tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 tokenizer_json_file = dir_model / "tokenizer.json"
 if not tokenizer_json_file.is_file():
    print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
    sys.exit(1)
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)
 print("gguf: get gpt2 tokenizer vocab")
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = (
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-    hparams["vocab_size"]
+assert max(tokenizer.vocab.values()) < vocab_size
    if "vocab_size" in hparams
    else len(tokenizer_json["model"]["vocab"])
 )
 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 for i in range(vocab_size):
-    if i in reverse_vocab:
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-        text = reverse_vocab[i]
+    scores.append(0.0) # dummy
-        try:
+    toktypes.append(gguf.TokenType.NORMAL)
            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode("utf-8"))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    tokens.append(text)
    scores.append(0.0)  # dymmy
    toktypes.append(gguf.TokenType.NORMAL)  # dummy
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
--- a/examples/batched.swift/.gitignore
+++ b/examples/batched.swift/.gitignore
@ -0,0 +1,9 @@
 .DS_Store
 /.build
 /Packages
 xcuserdata/
 DerivedData/
 .swiftpm/configuration/registries.json
 .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
 .netrc
 batched_swift
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@ -0,0 +1,6 @@
 .PHONY: build
 build:
 	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
 	rm -f ./batched_swift
 	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@ -0,0 +1,22 @@
 // swift-tools-version: 5.5
 // The swift-tools-version declares the minimum version of Swift required to build this package.
 import PackageDescription
 let package = Package(
    name: "batched_swift",
    platforms: [.macOS(.v12)],
    dependencies: [
        .package(name: "llama", path: "../../"),
    ],
    targets: [
        // Targets are the basic building blocks of a package, defining a module or a test suite.
        // Targets can depend on other targets in this package and products from dependencies.
        .executableTarget(
            name: "batched_swift",
            dependencies: ["llama"],
            path: "Sources",
            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
        ),
    ]
 )
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@ -0,0 +1,4 @@
 This is a swift clone of `examples/batched`.
 $ `make`
 $ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -0,0 +1,255 @@
 import Foundation
 import llama
 let arguments = CommandLine.arguments
 // Check that we have at least one argument (the model path)
 guard arguments.count > 1 else {
    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
    exit(1)
 }
 let modelPath: String = arguments[1]
 let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
 let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
 // total length of the sequences including the prompt
 let n_len: Int = 32
 // init LLM
 llama_backend_init(false)
 defer {
    llama_backend_free()
 }
 let model_params = llama_model_default_params()
 guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
    print("Failed to load model")
    exit(1)
 }
 defer {
    llama_free_model(model)
 }
 var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
 var context_params = llama_context_default_params()
 context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 let context = llama_new_context_with_model(model, context_params)
 guard context != nil else {
    print("Failed to initialize context")
    exit(1)
 }
 defer {
    llama_free(context)
 }
 let n_ctx = llama_n_ctx(context)
 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
 if n_kv_req > n_ctx {
    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
    exit(1)
 }
 var buffer: [CChar] = []
 for id: llama_token in tokens {
    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
 }
 print("\n")
 var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
 defer {
    llama_batch_free(batch)
 }
 // evaluate the initial prompt
 batch.n_tokens = Int32(tokens.count)
 for (i, token) in tokens.enumerated() {
    batch.token[i] = token
    batch.pos[i] = Int32(i)
    batch.seq_id[i] = 0
    batch.logits[i] = 0
 }
 // llama_decode will output logits only for the last token of the prompt
 batch.logits[Int(batch.n_tokens) - 1] = 1
 if llama_decode(context, batch) != 0 {
    print("llama_decode() failed")
    exit(1)
 }
 for i in 1 ..< n_parallel {
    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
 }
 if n_parallel > 1 {
    print("generating \(n_parallel) sequences ...\n")
 }
 var streams: [String] = .init(repeating: "", count: n_parallel)
 var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
 var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
 var n_cur = batch.n_tokens
 var n_decode = 0
 let t_main_start = ggml_time_us()
 while n_cur <= n_len {
    // prepare the next batch
    batch.n_tokens = 0
    // sample the next token for each parallel sequence / stream
    for i in 0 ..< n_parallel {
        if i_batch[i] < 0 {
            // the stream has already finished
            continue
        }
        var n_vocab = llama_n_vocab(model)
        var logits = llama_get_logits_ith(context, i_batch[i])
        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
        for token_id in 0 ..< n_vocab {
            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
        }
        var candidates_p: llama_token_data_array = .init(
            data: &candidates,
            size: candidates.count,
            sorted: false
        )
        let top_k: Int32 = 40
        let top_p: Float = 0.9
        let temp: Float = 0.4
        llama_sample_top_k(context, &candidates_p, top_k, 1)
        llama_sample_top_p(context, &candidates_p, top_p, 1)
        llama_sample_temp(context, &candidates_p, temp)
        let new_token_id = llama_sample_token(context, &candidates_p)
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
        // is it an end of stream? -> mark the stream as finished
        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
                print("stream \(i) finished at n_cur = \(n_cur)")
            }
            continue
        }
        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
        // if there is only one stream, we print immediately to stdout
        if n_parallel == 1 {
            print(nextStringPiece, terminator: "")
        }
        streams[i] += nextStringPiece
        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.seq_id[Int(batch.n_tokens)] = Int32(i)
        batch.logits[Int(batch.n_tokens)] = 1
        i_batch[i] = batch.n_tokens
        batch.n_tokens += 1
        n_decode += 1
    }
    // all streams are finished
    if batch.n_tokens == 0 {
        break
    }
    n_cur += 1
    // evaluate the current batch with the transformer model
    if llama_decode(context, batch) != 0 {
        print("llama_decode() failed")
        exit(1)
    }
 }
 if n_parallel > 1 {
    print("\n")
    for (i, stream) in streams.enumerated() {
        print("sequence \(i):\n\n\(prompt)\(stream)\n")
    }
 }
 let t_main_end = ggml_time_us()
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
 llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
    }
    tokens.deallocate()
    return swiftTokens
 }
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
        if result.count >= -Int(nTokens) {
            result.removeLast(-Int(nTokens))
        } else {
            result.removeAll()
        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
        return utfString
    } else {
        buffer.append(contentsOf: result)
        let data = Data(buffer.map { UInt8(bitPattern: $0) })
        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
            buffer = []
        }
        guard let bufferString = String(data: data, encoding: .utf8) else {
            return nil
        }
        buffer = []
        return bufferString
    }
    return nil
 }
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -233,10 +233,22 @@ int main(int argc, char ** argv) {
    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);
    bool suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
@ -627,10 +639,27 @@ int main(int argc, char ** argv) {
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);
                if (params.escape) {
                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
                    process_escapes(params.input_prefix);
                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
                // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
+                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(params.n_ctx, 0);
+    llama_batch batch = llama_batch_init(n_ctx, 0);
    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -344,9 +344,20 @@ struct llama_server_context
    void loadInfill()
    {
-        auto prefix_tokens = tokenize(params.input_prefix, true);  // always add BOS
+        bool suff_rm_leading_spc = true;
-        auto suffix_tokens = tokenize(params.input_suffix, true);  // always add BOS
+        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
            params.input_suffix.erase(0, 1);
            suff_rm_leading_spc = false;
        }
        auto prefix_tokens = tokenize(params.input_prefix, false);
        auto suffix_tokens = tokenize(params.input_suffix, false);
        const int space_token = 29871;
        if (suff_rm_leading_spc  && suffix_tokens[0] == space_token) {
            suffix_tokens.erase(suffix_tokens.begin());
        }
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
        prefix_tokens.push_back(llama_token_middle(ctx));
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 #ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/types.h>
            #include <sys/mman.h>
        #endif
    #endif
 #endif
 #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <memoryapi.h>
 #endif
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 struct ggml_allocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * data;
    size_t size;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor);
    UNUSED(alloc);
 }
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
+    return tensor->buffer == alloc->buffer;
    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 static bool ggml_is_view(struct ggml_tensor * t) {
@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
+
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
    tensor->data = addr;
    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
    tensor->buffer = alloc->buffer;
    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 #ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, tensor);
@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    void * ptr = tensor->data;
    if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
        return;
    }
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 #ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
    alloc->n_free_blocks = 1;
    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
    *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
+        /*.buffer        = */ buffer,
-        /*.size          = */ size,
+        /*.buffer_owned  = */ true,
        /*.base          = */ ggml_backend_buffer_get_base(buffer),
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
    return alloc;
 }
 // OS specific functions to allocate and free uncommitted virtual memory
 static void * alloc_vmem(size_t size) {
 #if defined(_WIN32)
    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
 #elif defined(_POSIX_MAPPED_FILES)
    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED) {
        return NULL;
    }
    return ptr;
 #else
    // use a fixed address for other platforms
    uintptr_t base_addr = (uintptr_t)-size - 0x100;
    return (void *)base_addr;
 #endif
 }
 static void free_vmem(void * base_addr, size_t size) {
 #if defined(_WIN32)
    VirtualFree(base_addr, 0, MEM_RELEASE);
    UNUSED(size);
 #elif defined(_POSIX_MAPPED_FILES)
    munmap(base_addr, size);
 #else
    // nothing to do
    UNUSED(base_addr);
    UNUSED(size);
 #endif
 }
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
    // 128GB for 64-bit, 1GB for 32-bit
    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
    do {
        *base_addr = alloc_vmem(*size);
        if (*base_addr != NULL) {
            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
            return;
        }
        // try again with half the size
        *size /= 2;
    } while (*size > 0);
    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
 }
 static void free_measure_vmem(void * base_addr, size_t size) {
    free_vmem(base_addr, size);
 }
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
    alloc->measure = true;
-    void * base_addr;
+    return alloc;
-    size_t size;
+}
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
    *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
+        /*.buffer        = */ buffer,
-        /*.size          = */ size,
+        /*.buffer_owned  = */ false,
-        /*.alignment     = */ alignment,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
        /*.parse_seq     = */ {0},
        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
+    if (alloc->buffer_owned) {
-        free_measure_vmem(alloc->data, alloc->size);
+        ggml_backend_buffer_free(alloc->buffer);
    }
    free(alloc);
 }
@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
        case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_CONT:
            return true;
        default:
@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
    }
 }
 static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
    assert(view->view_src != NULL && view->view_src->data != NULL);
    view->backend = view->view_src->backend;
    view->buffer  = view->view_src->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;
    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
    ggml_backend_buffer_init_tensor(alloc->buffer, view);
 }
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
    struct hash_node * ht = alloc->hash_table;
    if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
+            init_view(alloc, node);
            node->data = (char *)node->view_src->data + node->view_offs;
        } else {
            // see if we can reuse a parent's buffer (inplace)
            if (ggml_op_can_inplace(node->op)) {
@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                node->data = parent->data;
+                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
                                init_view(alloc, node);
                                return;
                            }
                        }
                        else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                            node->data = parent->data;
+                            node->view_src = parent;
                            p_hn->n_views += 1;
                            init_view(alloc, node);
                            return;
                        }
                    }
@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
    }
 }
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
    struct ggml_allocr * alloc,
    struct ggml_cgraph ** graphs, int n_graphs,
    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
            if (ggml_is_view(node)) {
                struct ggml_tensor * view_src = node->view_src;
                hash_get(ht, view_src)->n_views += 1;
                if (node->buffer == NULL && node->data != NULL) {
                    // view of a pre-allocated tensor, didn't call init_view() yet
                    init_view(alloc, node);
                }
            }
            for (int j = 0; j < GGML_MAX_SRC; j++) {
@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                    break;
                }
                hash_get(ht, parent)->n_children += 1;
                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
                    init_view(alloc, parent);
                }
            }
        }
    }
@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -6,9 +6,11 @@
 extern "C" {
 #endif
 struct ggml_backend_buffer;
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
@ -21,6 +23,10 @@ GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
 GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
 GGML_API size_t ggml_allocr_alloc_graph_n(
                    struct ggml_allocr * alloc,
                    struct ggml_cgraph ** graphs, int n_graphs,
                    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 #ifdef  __cplusplus
 }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -0,0 +1,385 @@
 #include "ggml-backend.h"
 #include "ggml-alloc.h"
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define UNUSED GGML_UNUSED
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 // backend buffer
 ggml_backend_buffer_t ggml_backend_buffer_init(
        struct ggml_backend                  * backend,
        struct ggml_backend_buffer_i           iface,
               ggml_backend_buffer_context_t   context,
               size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
    GGML_ASSERT(iface.get_base != NULL);
    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
        /* .backend   = */ backend,
        /* .context   = */ context,
        /* .size      = */ size,
    };
    return buffer;
 }
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
 }
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_get_alignment(buffer->backend);
 }
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_base(buffer);
 }
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
 }
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.get_alloc_size) {
        return buffer->iface.get_alloc_size(buffer, tensor);
    }
    return ggml_nbytes(tensor);
 }
 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
 }
 void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.free_tensor) {
        buffer->iface.free_tensor(buffer, tensor);
    }
 }
 // backend
 ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
    return tensor->buffer->backend;
 }
 const char * ggml_backend_name(ggml_backend_t backend) {
    return backend->iface.get_name(backend);
 }
 void ggml_backend_free(ggml_backend_t backend) {
    backend->iface.free(backend);
 }
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return backend->iface.alloc_buffer(backend, size);
 }
 size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return backend->iface.get_alignment(backend);
 }
 void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
 }
 void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
 }
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
 }
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
 }
 void ggml_backend_synchronize(ggml_backend_t backend) {
    backend->iface.synchronize(backend);
 }
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_plan_create(backend, cgraph);
 }
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_free(backend, plan);
 }
 void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_compute(backend, plan);
 }
 void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    backend->iface.graph_compute(backend, cgraph);
 }
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
 }
 // backend copy
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
 }
 void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
    if (src == dst) {
        return;
    }
    // TODO: allow backends to support copy to/from same backend
    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
    } else {
        // shouldn't be hit when copying from/to CPU
        #ifndef NDEBUG
        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
        #endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
 }
 // backend CPU
 struct ggml_backend_cpu_context {
    int n_threads;
    void * work_data;
    size_t work_size;
 };
 static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    return "CPU";
    UNUSED(backend);
 }
 static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    free(cpu_ctx->work_data);
    free(cpu_ctx);
    free(backend);
 }
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
 }
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    UNUSED(buffer);
 }
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
 };
 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL,
    /* .free_tensor    = */ NULL,
 };
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
 }
 static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
    return TENSOR_ALIGNMENT;
    UNUSED(backend);
 }
 static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy((char *)tensor->data + offset, data, size);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy(data, (const char *)tensor->data + offset, size);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
 }
 static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
 };
 static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph;
    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
    }
    return cpu_plan;
 }
 static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
    free(cpu_plan->cplan.work_data);
    free(cpu_plan);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    if (cpu_ctx->work_size < cplan.work_size) {
        // TODO: may be faster to free and use malloc to avoid the copy
        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
        cpu_ctx->work_size = cplan.work_size;
    }
    cplan.work_data = cpu_ctx->work_data;
    ggml_graph_compute(cgraph, &cplan);
 }
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return true;
    UNUSED(backend);
    UNUSED(op);
 }
 static struct ggml_backend_i cpu_backend_i = {
    /* .get_name            = */ ggml_backend_cpu_name,
    /* .free                = */ ggml_backend_cpu_free,
    /* .alloc_buffer        = */ ggml_backend_cpu_alloc_buffer,
    /* .get_alignment       = */ ggml_backend_cpu_get_alignment,
    /* .set_tensor_async    = */ ggml_backend_cpu_set_tensor_async,
    /* .get_tensor_async    = */ ggml_backend_cpu_get_tensor_async,
    /* .synchronize         = */ ggml_backend_cpu_synchronize,
    /* .cpy_tensor_from     = */ ggml_backend_cpu_cpy_tensor_from,
    /* .cpy_tensor_to       = */ ggml_backend_cpu_cpy_tensor_to,
    /* .graph_plan_create   = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free     = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_compute  = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute       = */ ggml_backend_cpu_graph_compute,
    /* .supports_op         = */ ggml_backend_cpu_supports_op,
 };
 ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;
    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
    *cpu_backend = (struct ggml_backend) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
    };
    return cpu_backend;
 }
 bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_cpu_name;
 }
 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
 }
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -0,0 +1,143 @@
 #pragma once
 #include "ggml.h"
 #ifdef  __cplusplus
 extern "C" {
 #endif
    struct ggml_backend;
    struct ggml_backend_buffer;
    // type-erased backend-specific types / wrappers
    typedef void * ggml_backend_context_t;
    typedef void * ggml_backend_graph_plan_t;
    typedef void * ggml_backend_buffer_context_t;
    // avoid accessing internals of these types
    typedef struct ggml_backend        * ggml_backend_t;
    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
    //
    // backend buffer
    //
    struct ggml_backend_buffer_i {
        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
    };
    // TODO: hide behind API
    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i iface;
        ggml_backend_t                backend;
        ggml_backend_buffer_context_t context;
        size_t size;
    };
    // backend buffer functions
    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
            struct ggml_backend                  * backend,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    //
    // backend
    //
    struct ggml_backend_i {
        const char * (*get_name)(ggml_backend_t backend);
        void (*free)(ggml_backend_t backend);
        // buffer allocation
        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
        // get buffer alignment
        size_t (*get_alignment)(ggml_backend_t backend);
        // tensor data access
        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        void (*synchronize)     (ggml_backend_t backend);
        // (optional) copy tensor between different backends, allow for single-copy tranfers
        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan
        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };
    // TODO: hide behind API
    struct ggml_backend {
        struct ggml_backend_i iface;
        ggml_backend_context_t context;
    };
    // backend helper functions
    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
    //
    // CPU backend
    //
    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    if (col >= ncols) {
        return;
    }
    const int r = y[row];
    // copy x[r*ncols + col] to dst[row*ncols + col]
    const int xi = r*ncols + col;
    const int di = row*ncols + col;
    const int ib = xi/qk; // block index
    const int iqs = (xi%qk)/qr; // quant index
    const int iybs = di - di%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;
    // dequantize
    dfloat2 v;
    dequantize_kernel(x, ib, iqs, v);
    dst[iybs + iqs + 0]        = v.x;
    dst[iybs + iqs + y_offset] = v.y;
 }
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
    dst[i] = scale * x[i];
 }
 static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, nrows, 1);
    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
 }
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
 }
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                          const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@ -5703,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
        kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
        int id;
        CUDA_CHECK(cudaGetDevice(&id));
        src_ptr = (char *) extra->data_device[id];
@ -5739,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
    }
 }
 static void ggml_cuda_op_repeat(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
    // guaranteed to be an integer due to the check in ggml_can_repeat
    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];
    const int64_t ne2 = dst->ne[2];
    const int64_t ne3 = dst->ne[3];
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];
    const size_t nb0 = dst->nb[0];
    const size_t nb1 = dst->nb[1];
    const size_t nb2 = dst->nb[2];
    const size_t nb3 = dst->nb[3];
    const size_t nb00 = src0->nb[0];
    const size_t nb01 = src0->nb[1];
    const size_t nb02 = src0->nb[2];
    const size_t nb03 = src0->nb[3];
    const int nr0 = (int)(ne0/ne00);
    const int nr1 = (int)(ne1/ne01);
    const int nr2 = (int)(ne2/ne02);
    const int nr3 = (int)(ne3/ne03);
    // TODO: support for transposed / permuted tensors
    GGML_ASSERT(nb0  == sizeof(float));
    GGML_ASSERT(nb00 == sizeof(float));
    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
    for                         (int i3 = 0; i3 < nr3;  i3++) {
        for                     (int k3 = 0; k3 < ne03; k3++) {
            for                 (int i2 = 0; i2 < nr2;  i2++) {
                for             (int k2 = 0; k2 < ne02; k2++) {
                    for         (int i1 = 0; i1 < nr1;  i1++) {
                        for     (int k1 = 0; k1 < ne01; k1++) {
                            for (int i0 = 0; i0 < nr0;  i0++) {
                                CUDA_CHECK(cudaMemcpyAsync(
                                              (char *)  dst_d + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
                                        (const char *) src0_d + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
                                        ne00*nb0, cudaMemcpyDeviceToDevice, stream));
                            }
                        }
                    }
                }
            }
        }
    }
    (void) src1;
    (void) src1_d;
 }
 static void ggml_cuda_op_get_rows(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    GGML_ASSERT(ggml_is_contiguous(dst));
    const int ncols = src0->ne[0];
    const int nrows = ggml_nelements(src1);
    const int32_t * src1_i32 = (const int32_t *) src1_d;
    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
            break;
        default:
            // TODO: k-quants
            GGML_ASSERT(false);
            break;
    }
 }
 inline void ggml_cuda_op_add(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@ -6279,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
    const int64_t ne02 = src0->ne[2];
    const int64_t nrows = ggml_nrows(src0);
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_head = ((int32_t *) dst->op_params)[1];
    float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-    GGML_ASSERT(ne01 + n_past == ne00);
+    //GGML_ASSERT(ne01 + n_past == ne00);
    GGML_ASSERT(n_head == ne02);
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@ -6343,7 +6498,14 @@ inline void ggml_cuda_op_scale(
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    const float scale = ((float *) src1->data)[0];
+    float scale;
    // HACK: support for ggml backend interface
    if (src1->backend == GGML_BACKEND_CPU) {
        scale = ((float *) src1->data)[0];
    } else {
        // TODO: pass pointer to kernel instead of copying to host
        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
    }
    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());
@ -6353,6 +6515,24 @@ inline void ggml_cuda_op_scale(
    (void) src1_dd;
 }
 inline void ggml_cuda_op_clamp(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    const float min = ((float *) dst->op_params)[0];
    const float max = ((float *) dst->op_params)[1];
    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());
    (void) src1;
    (void) dst;
    (void) src1_dd;
 }
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
    const int64_t nrows0 = ggml_nrows(src0);
@ -6362,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
-    struct ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@ -6505,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    struct ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
+    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
    const bool src0_is_contiguous = ggml_is_contiguous(src0);
@ -6585,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
        if (convert_src1_to_q8_1) {
            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
-            if (split && src1_on_device && src1_is_contiguous) {
+            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());
            }
@ -6667,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
                    GGML_ASSERT(false);
                }
-                if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                    CUDA_CHECK(cudaGetLastError());
                }
@ -6758,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
    }
 }
 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
 }
 static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
 }
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@ -6812,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@ -6843,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
    const int64_t row_stride_x = nb01 / sizeof(half);
@ -6870,7 +7058,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    }
-    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
@ -6906,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
 }
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));
@ -6935,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@ -6991,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
    const size_t nb1 = tensor->nb[1];
-    ggml_backend backend = tensor->backend;
+    ggml_backend_type backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
    memset(extra, 0, sizeof(*extra));
    for (int64_t id = 0; id < g_device_count; ++id) {
@ -7046,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
        }
        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
        extra->data_device[id] = buf;
@ -7085,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
    delete extra;
 }
-static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;
-static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
    if (g_temp_tensor_extras == nullptr) {
        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
    }
    size_t alloc_index = g_temp_tensor_extra_index;
    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
    memset(extra, 0, sizeof(*extra));
    return extra;
@ -7123,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
        return;
    }
-    struct ggml_tensor_extra_gpu * extra;
+    ggml_tensor_extra_gpu * extra;
    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW ||
@ -7132,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
@ -7141,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src0_ddc + offset;
    } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
        void * src1_ddv = src1_extra->data_device[g_main_device];
        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src1_ddv;
@ -7183,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
    }
-    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW;
    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t view_offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
@ -7207,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
    GGML_ASSERT(ggml_is_contiguous(tensor));
-    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
@ -7270,52 +7461,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-    switch (tensor->op) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
        case GGML_OP_DUP:
            if (!any_on_device) {
        return false;
    }
    switch (tensor->op) {
        case GGML_OP_REPEAT:
            func = ggml_cuda_repeat;
            break;
        case GGML_OP_GET_ROWS:
            func = ggml_cuda_get_rows;
            break;
        case GGML_OP_DUP:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_add;
            break;
        case GGML_OP_MUL:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_mul;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cuda_silu;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_NORM:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_norm;
            break;
        case GGML_OP_RMS_NORM:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
@ -7325,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_SCALE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_scale;
            break;
-        case GGML_OP_CPY:
+        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_clamp;
            break;
        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_alibi;
            break;
        default:
@ -7400,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    snprintf(description, description_size, "%s", prop.name);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
 #define UNUSED GGML_UNUSED
 struct ggml_backend_context_cuda {
 };
 static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
    return GGML_CUDA_NAME;
    UNUSED(backend);
 }
 static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
    delete cuda_ctx;
    delete backend;
 }
 struct ggml_backend_buffer_context_cuda {
    void * device;
    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
    size_t temp_tensor_extra_index = 0;
    ~ggml_backend_buffer_context_cuda() {
        delete[] temp_tensor_extras;
    }
    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
        if (temp_tensor_extras == nullptr) {
            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
        }
        size_t alloc_index = temp_tensor_extra_index;
        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
        memset(extra, 0, sizeof(*extra));
        return extra;
    }
 };
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
    CUDA_CHECK(cudaFree(ctx->device));
    delete ctx;
 }
 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
    return ctx->device;
 }
 static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    int64_t row_low = 0;
    int64_t row_high = ggml_nrows(tensor);
    int64_t nrows_split = row_high - row_low;
    size_t size = ggml_nbytes_split(tensor, nrows_split);
    int64_t ne0 = tensor->ne[0];
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
        }
    }
    return size;
    UNUSED(buffer);
 }
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
    if (tensor->view_src != NULL && tensor->view_offs == 0) {
        assert(tensor->view_src->buffer->backend == buffer->backend);
        tensor->backend = tensor->view_src->backend;
        tensor->extra = tensor->view_src->extra;
        return;
    }
    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
    extra->data_device[g_main_device] = tensor->data;
    tensor->backend = GGML_BACKEND_GPU;
    tensor->extra = extra;
    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
        int64_t row_low = 0;
        int64_t row_high = ggml_nrows(tensor);
        int64_t nrows_split = row_high - row_low;
        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
        if (padded_size > original_size && tensor->view_src == nullptr) {
            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
        }
    }
    UNUSED(buffer);
 }
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
    /* .free_tensor    = */ NULL,
 };
 static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
    ggml_cuda_set_device(g_main_device);
    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
    CUDA_CHECK(cudaMalloc(&ctx->device, size));
    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
    return 128;
    UNUSED(backend);
 }
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
    UNUSED(backend);
 }
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
    UNUSED(backend);
 }
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
    UNUSED(backend);
 }
 static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
    GGML_ASSERT(!"not implemented");
    return nullptr;
    UNUSED(backend);
    UNUSED(cgraph);
 }
 static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(!"not implemented");
    UNUSED(backend);
    UNUSED(plan);
 }
 static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(!"not implemented");
    UNUSED(backend);
    UNUSED(plan);
 }
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_cuda_set_device(g_main_device);
    ggml_compute_params params = {};
    params.type = GGML_TASK_COMPUTE;
    params.ith = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];
        assert(node->backend == GGML_BACKEND_GPU);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] != nullptr) {
                assert(node->src[j]->backend == GGML_BACKEND_GPU);
            }
        }
        bool ok = ggml_cuda_compute_forward(&params, node);
        if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
 #if 0
        if (node->type == GGML_TYPE_F32) {
            cudaDeviceSynchronize();
            std::vector<float> tmp(ggml_nelements(node), 0.0f);
            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
                ggml_type_name(node->src[0]->type),
                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
                node->src[0]->name,
                node->src[1] ? node->src[1]->name : "none");
            double sum = 0.0;
            double sq_sum = 0.0;
            for (int i = 0; i < ggml_nelements(node); i++) {
                printf("%f ", tmp[i]);
                sum += tmp[i];
                sq_sum += tmp[i]*tmp[i];
            }
            printf("\n");
            printf("sum: %f, ", sum);
            printf("sq_sum: %f\n", sq_sum);
        }
 #endif
    }
    UNUSED(backend);
 }
 static ggml_backend_i cuda_backend_i = {
    /* .get_name            = */ ggml_backend_cuda_name,
    /* .free                = */ ggml_backend_cuda_free,
    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
    /* .synchronize         = */ ggml_backend_cuda_synchronize,
    /* .cpy_tensor_from     = */ nullptr,
    /* .cpy_tensor_to       = */ nullptr,
    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
    /* .supports_op         = */ nullptr,
 };
 ggml_backend_t ggml_backend_cuda_init() {
    ggml_init_cublas(); // TODO: remove from ggml.c
    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
    ggml_backend_t cuda_backend = new ggml_backend {
        /* .interface = */ cuda_backend_i,
        /* .context   = */ ctx
    };
    return cuda_backend;
 }
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -1,6 +1,7 @@
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
@ -42,6 +43,9 @@ GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 // backend API
 GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -20,6 +20,7 @@
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #include <stddef.h>
 #include <stdbool.h>
@ -35,10 +36,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
 // internal API
 // temporary exposed to user-code
 //
 struct ggml_metal_context;
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 //
 // backend API
 // user-code should use only these functions
 //
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_CONCAT:
                        {
                            const int64_t nb = ne00;
                            int64_t nb = ne00;
                            [encoder setComputePipelineState:ctx->pipeline_concat];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@ -812,6 +812,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
                            const int nth = MIN(1024, ne0);
                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ADD:
@ -909,9 +910,10 @@ void ggml_metal_graph_compute(
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-                            const int64_t n = ggml_nelements(dst)/4;
+                            const int64_t n = ggml_nelements(dst);
                            GGML_ASSERT(n % 4 == 0);
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
@ -921,9 +923,10 @@ void ggml_metal_graph_compute(
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
@ -941,9 +944,10 @@ void ggml_metal_graph_compute(
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            default:
                                {
@ -1040,7 +1044,7 @@ void ggml_metal_graph_compute(
                                !ggml_is_transposed(src0) &&
                                !ggml_is_transposed(src1) &&
                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 &&
+                                ne00 % 32 == 0 && ne00 >= 64 &&
                                ne11 > ne11_mm_min) {
                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                switch (src0->type) {
@ -1251,6 +1255,8 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
                            GGML_ASSERT(ne00 % 4 == 0);
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));
@ -1293,7 +1299,7 @@ void ggml_metal_graph_compute(
                            const int nth = MIN(1024, ne00);
-                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+                            //const int n_past = ((int32_t *) dst->op_params)[0];
                            const int n_head = ((int32_t *) dst->op_params)[1];
                            float max_bias;
                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@ -1456,3 +1462,140 @@ void ggml_metal_graph_compute(
    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    return "Metal";
    UNUSED(backend);
 }
 static void ggml_backend_metal_free(ggml_backend_t backend) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_free(ctx);
    free(backend);
 }
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
 }
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    UNUSED(buffer);
 }
 static struct ggml_backend_buffer_i metal_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
 };
 static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    void * data = ggml_metal_host_malloc(size);
    // TODO: set proper name of the buffers
    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
 }
 static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
    return 32;
    UNUSED(backend);
 }
 static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy((char *)tensor->data + offset, data, size);
    UNUSED(backend);
 }
 static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy(data, (const char *)tensor->data + offset, size);
    UNUSED(backend);
 }
 static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
 }
 static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return true;
    UNUSED(backend);
    UNUSED(op);
 }
 static struct ggml_backend_i metal_backend_i = {
    /* .get_name            = */ ggml_backend_metal_name,
    /* .free                = */ ggml_backend_metal_free,
    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
    /* .synchronize         = */ ggml_backend_metal_synchronize,
    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
    /* .graph_plan_free     = */ NULL,
    /* .graph_plan_compute  = */ NULL,
    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
    /* .supports_op         = */ ggml_backend_metal_supports_op,
 };
 ggml_backend_t ggml_backend_metal_init(void) {
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
    *metal_backend = (struct ggml_backend) {
        /* .interface = */ metal_backend_i,
        /* .context   = */ ctx,
    };
    return metal_backend;
 }
 bool ggml_backend_is_metal(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_metal_name;
 }
 void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_set_n_cb(ctx, n_cb);
 }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -347,6 +347,7 @@ kernel void kernel_rms_norm(
        uint   ntg[[threads_per_threadgroup]]) {
    device const float4 * x        = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
    device const float  * x_scalar = (device const float  *) x;
    float4 sumf = 0;
    float all_sum = 0;
@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // broadcast, simd group number is ntg / 32
    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
       if (tpitg < i) {
@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
       }
    }
    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
            sum[0] += x_scalar[i];
        }
        sum[0] /= ne00;
    }
@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
        y[i00] = x[i00] * scale;
    }
    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
            y_scalar[i00] = x_scalar[i00] * scale;
        }
    }
 }
--- a/ggml.c
+++ b/ggml.c
@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 //
 // end of logging block
 //
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 //
 // logging
 //
 #if (GGML_DEBUG >= 1)
 #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG(...)
 #endif
 #if (GGML_DEBUG >= 5)
 #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_5(...)
 #endif
 #if (GGML_DEBUG >= 10)
 #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_10(...)
 #endif
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 //
 // end of logging block
 //
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
    *result = (struct ggml_tensor) {
        /*.type         =*/ type,
        /*.backend      =*/ GGML_BACKEND_CPU,
        /*.buffer       =*/ NULL,
        /*.n_dims       =*/ n_dims,
        /*.ne           =*/ { 1, 1, 1, 1 },
        /*.nb           =*/ { 0, 0, 0, 0 },
@ -13082,24 +13059,22 @@ static void ggml_compute_forward_alibi_f32(
        return;
    }
-    const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_head = ((int32_t *) dst->op_params)[1];
    float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-    assert(n_past >= 0);
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t n  = ggml_nrows(src0);
-    const int ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
    const int ne2 = src0->ne[2]; // n_head -> this is k
    //const int ne3 = src0->ne[3]; // 1 -> bsz
-    const int n  = ggml_nrows(src0);
+    const size_t nb0 = src0->nb[0];
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const size_t nb1 = src0->nb[1];
-
+    const size_t nb2 = src0->nb[2];
    const int nb0 = src0->nb[0];
    const int nb1 = src0->nb[1];
    const int nb2 = src0->nb[2];
    //const int nb3 = src0->nb[3];
    GGML_ASSERT(nb0 == sizeof(float));
@ -13111,9 +13086,9 @@ static void ggml_compute_forward_alibi_f32(
    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-    for (int i = 0; i < ne0; i++) {
+    for (int64_t i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
+        for (int64_t j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
@ -13128,7 +13103,6 @@ static void ggml_compute_forward_alibi_f32(
                }
                pdst[0] = i * m_k + src[0];
            }
        }
    }
@ -20203,6 +20177,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        ggml_vec_cpy_f32(nx, xp, x);
        ggml_vec_cpy_f32(nx, gp, g);
        // TODO: instead of passing &cancel here, use the return code of the linesearch
        //       to determine if the optimization should be cancelled
        //       this is a simple change, but not doing this atm, since I don't have a nice
        //       way to test and don't want to break something with so many changes lined up
        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
        if (cancel) {
            return GGML_OPT_CANCEL;
--- a/ggml.h
+++ b/ggml.h
@ -326,7 +326,7 @@ extern "C" {
        GGML_TYPE_COUNT,
    };
-    enum ggml_backend {
+    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@ -480,7 +480,9 @@ extern "C" {
    // n-dimensional tensor
    struct ggml_tensor {
        enum ggml_type         type;
-        enum ggml_backend backend;
+        enum ggml_backend_type backend;
        struct ggml_backend_buffer * buffer;
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -514,7 +516,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[4];
+        char padding[12];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -1358,7 +1360,7 @@ extern "C" {
    // alibi position embedding
    // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
@ -1367,7 +1369,7 @@ extern "C" {
    // clamp
    // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            float                 min,
@ -2102,7 +2104,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 #ifdef  __cplusplus
 }
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@ -88,10 +88,12 @@ class MODEL_ARCH(IntEnum):
    PERSIMMON     : int = auto()
    REFACT        : int = auto()
    BERT          : int = auto()
    BLOOM         : int = auto()
 class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD      : int = auto()
    TOKEN_EMBD_NORM : int = auto()
    TOKEN_TYPES     : int = auto()
    POS_EMBD        : int = auto()
    OUTPUT          : int = auto()
@ -125,10 +127,12 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.PERSIMMON:      "persimmon",
    MODEL_ARCH.REFACT:         "refact",
    MODEL_ARCH.BERT:           "bert",
    MODEL_ARCH.BLOOM:          "bloom",
 }
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
    MODEL_TENSOR.POS_EMBD:        "position_embd",
    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
@ -282,6 +286,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BLOOM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_EMBD_NORM,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.GPT2: [
        # TODO
    ],
@ -311,6 +327,7 @@ class TensorNameMap:
            "gpt_neox.embed_in",                        # gptneox
            "transformer.wte",                          # gpt2 gpt-j mpt refact
            "transformer.word_embeddings",              # falcon
            "word_embeddings",                          # bloom
            "model.embed_tokens",                       # llama-hf
            "tok_embeddings",                           # llama-pth
            "embeddings.word_embeddings",               # bert
@ -322,6 +339,11 @@ class TensorNameMap:
            "embeddings.token_type_embeddings",  # bert
        ),
        # Normalization of token embeddings
        MODEL_TENSOR.TOKEN_EMBD_NORM: (
            "word_embeddings_layernorm",  # bloom
        ),
        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
            "transformer.wpe",                 # gpt2
@ -332,7 +354,7 @@ class TensorNameMap:
        MODEL_TENSOR.OUTPUT: (
            "embed_out",                # gptneox
            "lm_head",                  # gpt2 mpt falcon llama-hf baichuan
-            "output",                   # llama-pth
+            "output",                   # llama-pth bloom
            "word_embeddings_for_head", # persimmon
        ),
@ -344,7 +366,7 @@ class TensorNameMap:
            "norm",                                   # llama-pth
            "embeddings.LayerNorm",                   # bert
            "transformer.norm_f",                     # mpt
-            "ln_f",                                   # refact
+            "ln_f",                                   # refact bloom
            "language_model.encoder.final_layernorm", # persimmon
        ),
@ -361,6 +383,7 @@ class TensorNameMap:
            "transformer.h.{bid}.ln_1",                            # gpt2 gpt-j refact
            "transformer.blocks.{bid}.norm_1",                     # mpt
            "transformer.h.{bid}.input_layernorm",                 # falcon7b
            "h.{bid}.input_layernorm",                             # bloom
            "transformer.h.{bid}.ln_mlp",                          # falcon40b
            "model.layers.{bid}.input_layernorm",                  # llama-hf
            "layers.{bid}.attention_norm",                         # llama-pth
@ -379,6 +402,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.c_attn",                                    # gpt2
            "transformer.blocks.{bid}.attn.Wqkv",                                 # mpt
            "transformer.h.{bid}.self_attention.query_key_value",                 # falcon
            "h.{bid}.self_attention.query_key_value",                             # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
        ),
@ -412,6 +436,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.c_proj",                         # gpt2 refact
            "transformer.blocks.{bid}.attn.out_proj",                  # mpt
            "transformer.h.{bid}.self_attention.dense",                # falcon
            "h.{bid}.self_attention.dense",                            # bloom
            "model.layers.{bid}.self_attn.o_proj",                     # llama-hf
            "layers.{bid}.attention.wo",                               # llama-pth
            "encoder.layer.{bid}.attention.output.dense",              # bert
@ -429,6 +454,7 @@ class TensorNameMap:
        MODEL_TENSOR.FFN_NORM: (
            "gpt_neox.layers.{bid}.post_attention_layernorm",               # gptneox
            "transformer.h.{bid}.ln_2",                                     # gpt2 refact
            "h.{bid}.post_attention_layernorm",                             # bloom
            "transformer.blocks.{bid}.norm_2",                              # mpt
            "model.layers.{bid}.post_attention_layernorm",                  # llama-hf
            "layers.{bid}.ffn_norm",                                        # llama-pth
@ -442,6 +468,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_fc",                          # gpt2
            "transformer.blocks.{bid}.ffn.up_proj",                  # mpt
            "transformer.h.{bid}.mlp.dense_h_to_4h",                 # falcon
            "h.{bid}.mlp.dense_h_to_4h",                             # bloom
            "model.layers.{bid}.mlp.up_proj",                        # llama-hf refact
            "layers.{bid}.feed_forward.w3",                          # llama-pth
            "encoder.layer.{bid}.intermediate.dense",                # bert
@ -461,6 +488,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_proj",                        # gpt2 refact
            "transformer.blocks.{bid}.ffn.down_proj",                # mpt
            "transformer.h.{bid}.mlp.dense_4h_to_h",                 # falcon
            "h.{bid}.mlp.dense_4h_to_h",                             # bloom
            "model.layers.{bid}.mlp.down_proj",                      # llama-hf
            "layers.{bid}.feed_forward.w2",                          # llama-pth
            "encoder.layer.{bid}.output.dense",                      # bert
--- a/llama.cpp
+++ b/llama.cpp
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -2,6 +2,7 @@
 cp -rpv ../ggml/src/ggml.c                  ./ggml.c
 cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
 cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
 cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
@ -11,6 +12,7 @@ cp -rpv ../ggml/src/ggml-metal.m          ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
 cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { "   Hello"              , {     258,  23090, }, },
        { "    Hello"             , {     466,  23090, }, },
        { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
        { "\n ="                  , {    1212,     40, }, },
        { "' era"                 , {      18,   4932, }, },
    };
    return _k_tests;
@ -155,7 +157,7 @@ int main(int argc, char **argv) {
        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
@ -169,10 +171,8 @@ int main(int argc, char **argv) {
            }
            for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
            }
            ofs << "\n";
        }
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@ -41,6 +41,8 @@ tests = [
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        "\n =",
        "' era",
    ]
 for text in tests:
@ -69,15 +71,14 @@ fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
-                f.write(str(x) + ' ')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            f.write('\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -174,10 +174,8 @@ int main(int argc, char **argv) {
            }
            for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
            }
            ofs << "\n";
        }
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
--- a/tests/test-tokenizer-0-llama.py
+++ b/tests/test-tokenizer-0-llama.py
@ -81,15 +81,14 @@ fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s, add_bos=True)
        # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
-                f.write(str(x) + ' ')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            f.write('\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)