diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c1e36ee28..e41be76db 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index e61bfc6c3..57db17512 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -36,8 +36,9 @@ jobs: poetry install - name: Build package - run: poetry build + run: cd gguf-py && poetry build - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} + packages-dir: gguf-py/dist diff --git a/.gitignore b/.gitignore index 4d5767d22..420e0d6d0 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ *.gcno *.gcda *.dot +*.metallib .DS_Store .build/ .cache/ diff --git a/Package.swift b/Package.swift index 3ee3b2a20..1ea414cc1 100644 --- a/Package.swift +++ b/Package.swift @@ -10,15 +10,18 @@ let platforms: [SupportedPlatform]? = [ .tvOS(.v14) ] let exclude: [String] = [] -let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"] +let resources: [Resource] = [ + .process("ggml-metal.metal") +] +let additionalSources: [String] = ["ggml-metal.m"] let additionalSettings: [CSetting] = [ .unsafeFlags(["-fno-objc-arc"]), - .define("GGML_SWIFT"), .define("GGML_USE_METAL") ] #else let platforms: [SupportedPlatform]? 
= nil let exclude: [String] = ["ggml-metal.metal"] +let resources: [Resource] = [] let additionalSources: [String] = [] let additionalSettings: [CSetting] = [] #endif @@ -40,6 +43,7 @@ let package = Package( "ggml-alloc.c", "k_quants.c", ] + additionalSources, + resources: resources, publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32"]), diff --git a/build.zig b/build.zig index 3a8978bc3..b95491e03 100644 --- a/build.zig +++ b/build.zig @@ -111,12 +111,14 @@ pub fn build(b: *std.build.Builder) !void { const common = make.obj("common", "common/common.cpp"); const console = make.obj("common", "common/console.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); + const train = make.obj("train", "common/train.cpp"); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train }); const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); if (server.target.isWindows()) { diff --git a/common/common.cpp b/common/common.cpp index 60b00b5fb..0f55c33a7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // store the external file name in params params.prompt_file = argv[i]; std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { + if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n-predict") { @@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.cfg_negative_prompt)); - if (params.cfg_negative_prompt.back() == '\n') { + if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') { params.cfg_negative_prompt.pop_back(); } } else if (arg == "--cfg-scale") { diff --git a/ggml-metal.m b/ggml-metal.m index 4dc0341c9..046fe46b8 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -81,18 +81,18 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); - GGML_METAL_DECL_KERNEL(mul_mat_f32_f32); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); - 
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); @@ -185,56 +185,44 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); -#ifdef GGML_SWIFT - // load the default.metallib file + // load library { - NSError * error = nil; - - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"]; - NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath]; - NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"]; - NSURL * libURL = [NSURL fileURLWithPath:libPath]; - - // Load the metallib file into a Metal library - ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; - - if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } - } + NSBundle * bundle = nil; +#ifdef SWIFT_PACKAGE + bundle = SWIFTPM_MODULE_BUNDLE; #else - UNUSED(msl_library_source); - - // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource - { + bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; +#endif NSError * error = nil; + NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (libPath != nil) { + NSURL * libURL = [NSURL fileURLWithPath:libPath]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); + ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + } else { + GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); - - NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; - if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } + NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); + NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + if (error) { + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + MTLCompileOptions* options = nil; #ifdef GGML_QKK_64 - MTLCompileOptions* options = [MTLCompileOptions new]; - options.preprocessorMacros = @{ @"QK_K" : @(64) }; - ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; -#else - ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; 
+ options = [MTLCompileOptions new]; + options.preprocessorMacros = @{ @"QK_K" : @(64) }; #endif + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + } + if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } -#endif // load kernels { @@ -274,28 +262,30 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); - GGML_METAL_ADD_KERNEL(mul_mat_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + } GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); @@ -308,8 +298,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); #if TARGET_OS_OSX + // print MTL GPU family: + GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); + break; + } + } + + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); @@ -351,28 +354,30 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_q6_K); GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(norm); - GGML_METAL_DEL_KERNEL(mul_mat_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + } GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); @@ -437,7 +442,7 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru for (int i = 0; i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; - //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); + //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; @@ -1002,21 +1007,46 @@ void ggml_metal_graph_compute( } break; case GGML_OP_MUL_MAT: { - // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 - GGML_ASSERT(ne00 == ne10); - // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere - uint gqa = ne12/ne02; GGML_ASSERT(ne03 
== ne13); + const uint gqa = ne12/ne02; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; + +#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach + if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { + switch (src0t) { + case GGML_TYPE_F16: ne11_mm_min = 2; break; + case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; + case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; + case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; + case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; + case GGML_TYPE_Q5_0: // not tested yet + case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet + case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; + default: ne11_mm_min = 1; break; + } + } +#endif + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if (!ggml_is_transposed(src0) && + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && - [ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne00%32 == 0 && - ne11 > 2) { + ne00 % 32 == 0 && + ne11 > ne11_mm_min) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; @@ -1045,17 +1075,18 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); // use custom matrix x vector kernel switch (src0t) { case GGML_TYPE_F32: { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; nrows = 4; } break; case GGML_TYPE_F16: @@ -1063,12 +1094,12 @@ void ggml_metal_graph_compute( nth0 = 32; nth1 = 1; if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; nrows = ne11; } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; nrows = 4; } } break; @@ -1079,7 +1110,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; } break; case GGML_TYPE_Q4_1: { @@ 
-1088,7 +1119,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; case GGML_TYPE_Q8_0: { @@ -1097,7 +1128,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; } break; case GGML_TYPE_Q2_K: { @@ -1106,7 +1137,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; } break; case GGML_TYPE_Q3_K: { @@ -1115,7 +1146,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; } break; case GGML_TYPE_Q4_K: { @@ -1124,7 +1155,7 @@ void ggml_metal_graph_compute( nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; } break; case GGML_TYPE_Q5_K: { @@ -1133,7 +1164,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; } break; case GGML_TYPE_Q6_K: { @@ -1142,7 +1173,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; } break; default: { @@ -1171,7 +1202,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) { + src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q4_K) { diff --git a/ggml-metal.metal b/ggml-metal.metal index 0df4923fa..99b9fd7a7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -13,8 +13,8 @@ typedef struct { #define QK4_1 32 typedef struct { - half d; // delta - half m; // min + half d; // delta + half m; // min uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; @@ -429,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre } // putting them in the kernel cause a significant performance penalty -#define N_DST 4 // each SIMD group works on 4 rows -#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. 
It also does not @@ -441,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; + const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q_type * x = (device const block_q_type *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - float yl[16]; // src1 vector cache - float sumf[nr]={0.f}; - const int ix = tiisg/2; - const int il = 8*(tiisg%2); + float yl[16]; // src1 vector cache + float sumf[nr] = {0.f}; + + const int ix = (tiisg/2); + const int il = (tiisg%2)*8; device const float * yb = y + ix * QK4_0 + il; @@ -463,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device sumy += yb[i] + yb[i+1]; yl[i+0] = yb[i+ 0]; yl[i+1] = yb[i+ 1]/256.f; + sumy += yb[i+16] + yb[i+17]; yl[i+8] = yb[i+16]/16.f; yl[i+9] = yb[i+17]/4096.f; @@ -478,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device for (int row = 0; row < nr; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0 && first_row + row < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot; } } } -kernel void kernel_mul_mat_q4_0_f32( +kernel void kernel_mul_mv_q4_0_f32( device const void * src0, device const float * src1, device float * dst, @@ -496,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32( constant int64_t & ne1[[buffer(16)]], constant uint & gqa[[buffer(17)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } -kernel void kernel_mul_mat_q4_1_f32( +kernel void kernel_mul_mv_q4_1_f32( device const void * src0, device const float * src1, device float * dst, @@ -521,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32( #define NB_Q8_0 8 -kernel void kernel_mul_mat_q8_0_f32( +kernel void kernel_mul_mv_q8_0_f32( device const void * src0, device const float * src1, device float * dst, @@ -585,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32( #define N_F32_F32 4 -kernel void kernel_mul_mat_f32_f32( +kernel void kernel_mul_mv_f32_f32( device const char * src0, device const char * src1, device float * dst, @@ -656,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32( } } -kernel void kernel_mul_mat_f16_f32_1row( +kernel void kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, device float * dst, @@ -675,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row( constant int64_t & ne0, constant int64_t & ne1, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; @@ -710,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row( #define N_F16_F32 4 -kernel void kernel_mul_mat_f16_f32( +kernel void kernel_mul_mv_f16_f32( device const char * src0, device const char * src1, device float * dst, @@ -782,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32( } // Assumes row size (ne00) is a multiple of 4 -kernel 
void kernel_mul_mat_f16_f32_l4( +kernel void kernel_mul_mv_f16_f32_l4( device const char * src0, device const char * src1, device float * dst, @@ -1259,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { //====================================== dot products ========================= -kernel void kernel_mul_mat_q2_K_f32( +kernel void kernel_mul_mv_q2_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1403,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32( } #if QK_K == 256 -kernel void kernel_mul_mat_q3_K_f32( +kernel void kernel_mul_mv_q3_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1555,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32( } } #else -kernel void kernel_mul_mat_q3_K_f32( +kernel void kernel_mul_mv_q3_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1626,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32( #endif #if QK_K == 256 -kernel void kernel_mul_mat_q4_K_f32( +kernel void kernel_mul_mv_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1732,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32( } } #else -kernel void kernel_mul_mat_q4_K_f32( +kernel void kernel_mul_mv_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1821,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32( } #endif -kernel void kernel_mul_mat_q5_K_f32( +kernel void kernel_mul_mv_q5_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1994,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32( } -kernel void kernel_mul_mat_q6_K_f32( +kernel void kernel_mul_mv_q6_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -2332,7 +2338,7 @@ kernel void kernel_get_rows( } #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A -#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B #define BLOCK_SIZE_K 32 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B @@ -2369,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0, const uint r0 = tgpig.y; const uint r1 = tgpig.x; const uint im = tgpig.z; + // if this block is of 64x32 shape or smaller short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + // a thread shouldn't load data outside of the matrix short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; @@ -2395,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0, + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { - //load data and store to threadgroup memory + // load data and store to threadgroup memory half4x4 temp_a; dequantize_func(x, il, temp_a); threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) for (int i = 0; i < 16; i++) { *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \ - = *((device float2x4 *)y); + + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? x + (2+nl-1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); - //load matrices from threadgroup memory and conduct outer products + + // load matrices from threadgroup memory and conduct outer products threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + #pragma unroll(4) for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { #pragma unroll(4) @@ -2429,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0, lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + #pragma unroll(8) for (int i = 0; i < 8; i++){ simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); @@ -2437,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0, } if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { - device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \ - + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0; + device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; for (int i = 0; i < 8; i++) { simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); } } else { // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; for (int i = 0; i < 8; i++) { simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); - device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; - if (sgitg==0) { + + device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg == 0) { for (int i = 0; i < n_rows; i++) { - for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); } } diff --git a/gguf-py/README.md b/gguf-py/README.md index ffe25c495..a28d8c57a 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -69,4 +69,3 @@ python -m twine upload dist/* ## TODO - [ ] Add tests - [ ] Include conversion scripts as command line entry 
points in this package.
-- Add CI workflow for releasing the package.
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 400607ce1..07a7ab4dd 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.0"
+version = "0.4.4"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML "]
 packages = [
diff --git a/llama.cpp b/llama.cpp
index 9bdc242d3..89220a675 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2051,7 +2051,7 @@ static void llm_load_hparams(
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
-            }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -4930,7 +4930,7 @@ static struct ggml_cgraph * llama_build_graph(
         case LLM_ARCH_PERSIMMON:
             {
                 result = llm_build_persimmon(lctx, batch);
-            }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);
@@ -7198,6 +7198,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
     const size_t meta_size = gguf_get_meta_size(ctx_out);
diff --git a/requirements.txt b/requirements.txt
index 7dc51edb1..81c909d0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-numpy==1.24
+numpy==1.24.4
 sentencepiece==0.1.98
 gguf>=0.1.0
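
Note on the common/common.cpp hunks above: std::string::back() on an empty string is undefined behavior, so the trailing-newline strip is now guarded with an empty() check for both the prompt file and the negative-prompt file. A minimal, self-contained sketch of the guarded pattern (the helper name read_prompt_file is illustrative only, not a function in the repository):

    #include <fstream>
    #include <iterator>
    #include <string>

    // Read a whole file into a string and strip one trailing newline.
    // The empty() check matters: calling back()/pop_back() on an empty
    // string (e.g. an empty prompt file) is undefined behavior.
    static std::string read_prompt_file(const std::string & path) {
        std::ifstream file(path);
        std::string prompt;
        std::copy(std::istreambuf_iterator<char>(file),
                  std::istreambuf_iterator<char>(),
                  std::back_inserter(prompt));
        if (!prompt.empty() && prompt.back() == '\n') {
            prompt.pop_back();
        }
        return prompt;
    }

This mirrors the guards added to params.prompt and params.cfg_negative_prompt in gpt_params_parse.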
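
Similarly, the llama.cpp change in llama_model_quantize_internal turns output-stream failures into exceptions so a bad write cannot silently produce a truncated GGUF file. A rough standalone sketch of the same pattern (the output filename is made up for illustration; with a failbit-only exception mask, a failed write typically surfaces as an exception on the next stream operation):

    #include <fstream>
    #include <iostream>

    int main() {
        std::ofstream fout("out.gguf", std::ios::binary);
        // Enable failbit exceptions, as in the diff: once the stream enters a
        // failed state, subsequent operations throw std::ios_base::failure
        // instead of silently setting error flags.
        fout.exceptions(std::ofstream::failbit);
        try {
            fout.write("GGUF", 4);
        } catch (const std::ios_base::failure & e) {
            std::cerr << "write failed: " << e.what() << std::endl;
            return 1;
        }
        return 0;
    }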