diff --git a/common/common.cpp b/common/common.cpp index 715adf946..9fa184725 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -251,6 +251,57 @@ int32_t cpu_get_num_math() { return cpu_get_num_physical_cores(); } +// Helper for setting process priority + +#if defined(_WIN32) + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + DWORD p = NORMAL_PRIORITY_CLASS; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + if (!SetPriorityClass(GetCurrentProcess(), p)) { + fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + + return true; +} + +#else // MacOS and POSIX +#include +#include + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + int p = 0; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = 0; break; + case GGML_SCHED_PRIO_MEDIUM: p = -5; break; + case GGML_SCHED_PRIO_HIGH: p = -10; break; + case GGML_SCHED_PRIO_REALTIME: p = -20; break; + } + + if (!setpriority(PRIO_PROCESS, 0, p)) { + fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + return false; + } + return true; +} + +#endif + // // CLI argument parsing // @@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) { } } +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { + int32_t n_set = 0; + + if (cpuparams.n_threads < 0) { + // Assuming everything about cpuparams is invalid + if (role_model != nullptr) { + cpuparams = *role_model; + } else { + cpuparams.n_threads = cpu_get_num_math(); + } + } + + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (cpuparams.cpumask[i]) { + n_set++; + } + } + + if (n_set && n_set < cpuparams.n_threads) { + // Not enough set bits, may experience performance issues. + fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -331,7 +411,7 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); get_env("LLAMA_ARG_HF_REPO", params.hf_repo); get_env("LLAMA_ARG_HF_FILE", params.hf_file); - get_env("LLAMA_ARG_THREADS", params.n_threads); + get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads); get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); get_env("LLAMA_ARG_BATCH", params.n_batch); @@ -368,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { + size_t dash_loc = range.find('-'); + if (dash_loc == std::string::npos) { + fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + return false; + } + + size_t start_i; + size_t end_i; + + if (dash_loc == 0) { + start_i = 0; + } else { + start_i = std::stoull(range.substr(0, dash_loc)); + if (start_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "Start index out of bounds!\n"); + return false; + } + } + + if (dash_loc == range.length() - 1) { + end_i = GGML_MAX_N_THREADS - 1; + } else { + end_i = std::stoull(range.substr(dash_loc + 1)); + if (end_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "End index out of bounds!\n"); + return false; + } + } + + for (size_t i = start_i; i <= end_i; i++) { + boolmask[i] = true; + } + + return true; +} + +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { + // Discard potential 0x prefix + size_t start_i = 0; + if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { + start_i = 2; + } + + size_t num_digits = mask.length() - start_i; + if (num_digits > 128) num_digits = 128; + + size_t end_i = num_digits + start_i; + + for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { + char c = mask.at(i); + int8_t id = c; + + if ((c >= '0' && c <= '9')) { + id -= '0'; + } else if (c >= 'a' && c <= 'f') { + id -= 'a' - 10; + } else if (c >= 'A' && c <= 'F') { + id -= 'A' - 10; + } else { + fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + return false; + } + + boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); + boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); + boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); + boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + } + + return true; +} + #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { @@ -384,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "-t" || arg == "--threads") { CHECK_ARG - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = std::stoi(argv[i]); + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-C" || arg == "--cpu-mask") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + return true; + } + if (arg == "-Cr" || arg == "--cpu-range") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); + return true; + } + if (arg == "--prio") { + CHECK_ARG + params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict") { + CHECK_ARG + params.cpuparams.strict_cpu = std::stoul(argv[i]); + return true; + } + if (arg == "--poll") { + CHECK_ARG + params.cpuparams.poll = std::stoul(argv[i]); + return true; + } if (arg == "-tb" || arg == "--threads-batch") { CHECK_ARG - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Cb" || arg == "--cpu-mask-batch") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crb" || arg == "--cpu-range_batch") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch") { + CHECK_ARG + params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch") { + params.cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch") { + CHECK_ARG + params.cpuparams_batch.poll = std::stoul(argv[i]); + return true; + } if (arg == "-td" || arg == "--threads-draft") { CHECK_ARG - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; + } + if (arg == "-Cd" || arg == "--cpu-mask-draft") { + CHECK_ARG + std::string mask = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "-Crd" || arg == "--cpu-range-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "--prio-draft") { + CHECK_ARG + params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-draft") { + params.draft_cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll-draft") { + CHECK_ARG + params.draft_cpuparams.poll = std::stoul(argv[i]); + return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { CHECK_ARG - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch-draft") { + CHECK_ARG + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch-draft") { + params.draft_cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch-draft") { + CHECK_ARG + params.draft_cpuparams_batch.poll = std::stoul(argv[i]); + return true; + } if (arg == "-p" || arg == "--prompt") { CHECK_ARG params.prompt = argv[i]; @@ -1498,11 +1757,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + +#ifndef GGML_USE_OPENMP + // these options are available only with the internal threadpool + options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); + options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); + options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); + options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); + + options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); + options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); + options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); + options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); + options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); + + options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); + options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); + options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); + options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); + options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); + + options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); + options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); + options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); + options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); +#endif // GGML_USE_OPENMP + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", @@ -1774,7 +2062,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); printf("usage: %s [options]\n", argv[0]); @@ -1806,9 +2093,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.cpuparams.n_threads; + if (params.cpuparams_batch.n_threads != -1) { + os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")"; } #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later // TODO: windows + arm64 + mingw64 @@ -2332,8 +2619,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.n_threads = params.cpuparams.n_threads; + cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? + params.cpuparams.n_threads : params.cpuparams_batch.n_threads; cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; @@ -2359,6 +2647,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { + struct ggml_threadpool_params tpp; + + ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults + + if (params.mask_valid) { + std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS); + } + + tpp.prio = params.priority; + tpp.poll = params.poll; + tpp.strict_cpu = params.strict_cpu; + + return tpp; +} + #ifdef LLAMA_USE_CURL static bool starts_with(const std::string & str, const std::string & prefix) { @@ -3348,7 +3652,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index f603ba2be..cb5e7f6df 100644 --- a/common/common.h +++ b/common/common.h @@ -67,13 +67,18 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; +struct cpu_params { + int n_threads = -1; + bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +}; + struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -100,6 +105,11 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + struct cpu_params draft_cpuparams; + struct cpu_params draft_cpuparams_batch; + ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -204,7 +214,7 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int32_t n_threads_http = -1; // number of threads to process HTTP requests + int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params); +bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool set_process_priority(enum ggml_sched_priority prio); + // // String utils // @@ -327,8 +342,9 @@ struct llama_init_result { struct llama_init_result llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index aca332e94..3ce91070b 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f; #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 47cb16c69..97622f4f4 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -21,7 +21,7 @@ #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); @@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) struct benchmark_params_struct { - int32_t n_threads = 1; + int n_threads = 1; int32_t n_iterations = 10; }; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 8fa492571..a68268388 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -486,8 +486,8 @@ int main(int argc, char ** argv) { if (use_pca) { // run PCA PCA::pca_params pca_params; - pca_params.n_threads = params.n_threads; - pca_params.n_batch = params.n_pca_batch; + pca_params.n_threads = params.cpuparams.n_threads; + pca_params.n_batch = params.n_pca_batch; pca_params.n_iterations = params.n_pca_iterations; PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); } else { diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index c7e5ca788..8df457e21 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -410,7 +410,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity == 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 42918bfc7..8edadef90 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "ggml.h" #include "llama.h" @@ -225,6 +226,9 @@ struct cmd_params { std::vector type_k; std::vector type_v; std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; std::vector n_gpu_layers; std::vector rpc_servers; std::vector split_mode; @@ -236,6 +240,8 @@ struct cmd_params { std::vector embeddings; ggml_numa_strategy numa; int reps; + ggml_sched_priority prio; + int delay; bool verbose; output_formats output_format; output_formats output_format_stderr; @@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = { /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {cpu_get_num_math()}, + /* cpu_mask */ {"0x0"}, + /* cpu_strict */ {false}, + /* poll */ {50}, /* n_gpu_layers */ {99}, /* rpc_servers */ {""}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, @@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = { /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, + /* prio */ GGML_SCHED_PRIO_NORMAL, + /* delay */ 0, /* verbose */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, @@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); @@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); + printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); @@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); + } else if (arg == "--cpu-strict") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); + } else if (arg == "--poll") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.poll.insert(params.poll.end(), p.begin(), p.end()); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; @@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.reps = std::stoi(argv[i]); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); + } else if (arg == "--delay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.delay = std::stoi(argv[i]); } else if (arg == "-o" || arg == "--output") { if (++i >= argc) { invalid_param = true; @@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } + if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } + if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } + if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } return params; } @@ -598,6 +652,9 @@ struct cmd_params_instance { ggml_type type_k; ggml_type type_v; int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; int n_gpu_layers; std::string rpc_servers; llama_split_mode split_mode; @@ -667,7 +724,10 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tv : params.type_v) for (const auto & nkvo : params.no_kv_offload) for (const auto & fa : params.flash_attn) - for (const auto & nt : params.n_threads) { + for (const auto & nt : params.n_threads) + for (const auto & cm : params.cpu_mask) + for (const auto & cs : params.cpu_strict) + for (const auto & pl : params.poll) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { continue; @@ -681,6 +741,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -707,6 +770,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -733,6 +799,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -769,6 +838,9 @@ struct test { int n_batch; int n_ubatch; int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; bool has_rpc; ggml_type type_k; ggml_type type_v; @@ -795,6 +867,9 @@ struct test { n_batch = inst.n_batch; n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; has_rpc = !inst.rpc_servers.empty(); type_k = inst.type_k; type_v = inst.type_v; @@ -872,13 +947,14 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", - "n_threads", "type_k", "type_v", + "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts" + "avg_ts", "stddev_ts", }; return fields; } @@ -887,7 +963,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || + field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || @@ -896,6 +972,7 @@ struct test { } if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } @@ -928,7 +1005,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), + ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), @@ -1067,7 +1145,7 @@ struct markdown_printer : public printer { return -30; } if (field == "t/s") { - return 16; + return 20; } if (field == "size" || field == "params") { return 10; @@ -1149,6 +1227,15 @@ struct markdown_printer : public printer { if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } + if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) { + fields.emplace_back("cpu_mask"); + } + if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) { + fields.emplace_back("cpu_strict"); + } + if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) { + fields.emplace_back("poll"); + } if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.emplace_back("n_batch"); } @@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + set_process_priority(params.prio); + // initialize printer std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); @@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); + // cool off before the test + if (params.delay) { + std::this_thread::sleep_for(std::chrono::seconds(params.delay)); + } + + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); + if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { + LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); + exit(1); + } + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; + + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_threadpool(ctx, threadpool, NULL); + // warmup run if (t.n_prompt > 0) { //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); @@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + + ggml_threadpool_free(threadpool); } llama_free_model(lmodel); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 58c32ca53..48b7840ae 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -71,8 +71,8 @@ actor LlamaContext { var ctx_params = llama_context_default_params() ctx_params.seed = 1234 ctx_params.n_ctx = 2048 - ctx_params.n_threads = UInt32(n_threads) - ctx_params.n_threads_batch = UInt32(n_threads) + ctx_params.n_threads = Int32(n_threads) + ctx_params.n_threads_batch = Int32(n_threads) let context = llama_new_context_with_model(model, ctx_params) guard let context else { diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 8c7dd2ae3..86b39f20e 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para if (!params->image.empty()) { LOG_TEE("using base64 encoded image instead of command line image path\n"); } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { LOG_TEE("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embed) { fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); return NULL; diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 379fc295f..f500ea5b9 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling, static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; return NULL; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4a342ad03..2c05afb04 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -221,6 +221,40 @@ int main(int argc, char ** argv) { return 1; } + LOG("%s: llama threadpool init = n_threads = %d\n", + __func__, + (int) params.cpuparams.n_threads + ); + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + set_process_priority(params.cpuparams.priority); + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + + // Start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + if (ctx_guidance) { + llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch); + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); @@ -989,6 +1023,9 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); + ggml_threadpool_free(threadpool); + ggml_threadpool_free(threadpool_batch); + #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c37182fe4..cc938e80d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) { }); LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, + {"n_threads", params.cpuparams.n_threads}, + {"n_threads_batch", params.cpuparams_batch.n_threads}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b051a18f1..1616edecb 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -73,10 +73,11 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; + if (params.draft_cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.draft_cpuparams.n_threads; } - params.n_threads_batch = params.n_threads_batch_draft; + + params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; llama_init_result llama_init_dft = llama_init_from_gpt_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 434c13b34..0dff47d65 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -7,8 +7,8 @@ extern "C" { #endif typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index e73b9a745..e497b6d02 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -103,6 +103,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b11d047ae..5233a9995 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -231,6 +231,8 @@ #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#define GGML_MAX_N_THREADS 512 + #endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 @@ -628,6 +630,29 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + // Scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // Threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -635,6 +660,7 @@ extern "C" { uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; + struct ggml_threadpool * threadpool; // abort ggml_graph_compute when true ggml_abort_callback abort_callback; @@ -2057,10 +2083,23 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); + GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, /* = GGML_DEFAULT_N_THREADS */ + struct ggml_threadpool * threadpool /* = NULL */ ); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ff84b9bb5..ec7d30825 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1247,7 +1247,7 @@ endif() # Data types, macros and functions related to controlling CPU affinity and # some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux") +if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") add_compile_definitions(_GNU_SOURCE) endif() diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 8856967c9..5b877db35 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; - void * work_data; - size_t work_size; + int n_threads; + ggml_threadpool_t threadpool; + + void * work_data; + size_t work_size; ggml_abort_callback abort_callback; void * abort_callback_data; @@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + + if (ctx->threadpool && ctx->threadpool != threadpool) { + // already had a different threadpool, pause/suspend it before switching + ggml_threadpool_pause(ctx->threadpool); + } + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9c105fd35..dc6cdca0b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -69,23 +69,42 @@ int ggml_sve_cnt_b = 0; #endif #include +#if !defined(__clang__) typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; typedef atomic_int atomic_flag; #define ATOMIC_FLAG_INIT 0 +typedef enum { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst +} memory_order; + static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } +static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) { + // TODO: add support for explicit memory order + InterlockedExchange(ptr, val); +} static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } +static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedCompareExchange(ptr, 0, 0); +} static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { - return atomic_fetch_add(ptr, -(dec)); +static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedExchangeAdd(ptr, inc); } static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { return InterlockedExchange(ptr, 1); @@ -93,6 +112,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { static void atomic_flag_clear(atomic_flag * ptr) { InterlockedExchange(ptr, 0); } +#else // clang +#include +#endif typedef HANDLE pthread_t; @@ -121,8 +143,10 @@ static int sched_yield (void) { return 0; } #else + #include #include +#include typedef void * thread_ret_t; @@ -1868,28 +1892,102 @@ struct ggml_context_container { struct ggml_context context; }; -struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; +// +// Threading defs +// - int n_threads; +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; + +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 +#define ggml_cond_init(c) pthread_cond_init(c, NULL) +#define ggml_cond_destroy(c) pthread_cond_destroy(c) +#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) +#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +// Threadpool def +struct ggml_threadpool { + ggml_mutex_t mutex; // mutex for cond.var + ggml_cond_t cond; // cond.var for waiting for new work + + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int n_barrier; atomic_int n_barrier_passed; + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. - ggml_abort_callback abort_callback; // abort ggml_graph_compute when true - void * abort_callback_data; + // these are atomic as an annotation for thread-sanitizer + atomic_bool stop; // Used for stopping the threadpool altogether + atomic_bool pause; // Used for pausing the threadpool or individual threads - atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads + struct ggml_compute_state * workers; // per thread state + int n_threads_max; // number of threads in the pool + int n_threads_cur; // number of threads used in the current graph + + int32_t prio; // Scheduling priority + uint32_t poll; // Polling level (0 - no polling) enum ggml_status ec; }; +// Per-thread state struct ggml_compute_state { +#ifndef GGML_USE_OPENMP ggml_thread_t thrd; + bool cpumask[GGML_MAX_N_THREADS]; + int last_graph; + bool pending; +#endif + struct ggml_threadpool * threadpool; int ith; - struct ggml_compute_state_shared * shared; }; struct ggml_compute_params { @@ -1900,7 +1998,7 @@ struct ggml_compute_params { size_t wsize; void * wdata; - struct ggml_compute_state_shared * shared; + struct ggml_threadpool * threadpool; }; // @@ -2971,6 +3069,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// Helpers for polling loops +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) +static inline void ggml_thread_cpu_relax(void) { + __asm__ volatile("yield" ::: "memory"); +} +#elif defined(__x86_64__) +static inline void ggml_thread_cpu_relax(void) { + _mm_pause(); +} +#else +static inline void ggml_thread_cpu_relax(void) {;} +#endif + // // NUMA support // @@ -3018,42 +3129,36 @@ inline static void ggml_critical_section_start(void) { } #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_threadpool * threadpool) { + if (threadpool->n_threads_cur == 1) { return; } #pragma omp barrier } #else -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_threadpool * threadpool) { + if (threadpool->n_threads_cur == 1) { return; } - atomic_int * n_barrier = &shared->n_barrier; - atomic_int * n_barrier_passed = &shared->n_barrier_passed; + atomic_int * n_barrier = &threadpool->n_barrier; + atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; - int n_threads = shared->n_threads; - int passed_old = atomic_load(n_barrier_passed); + int n_threads = threadpool->n_threads_cur; + int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { // last thread atomic_store(n_barrier, 0); - atomic_fetch_add(n_barrier_passed, 1); + atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); } else { // wait for other threads - const int n_spin_before_sleep = 100000; while (true) { - for (int i = 0; i < n_spin_before_sleep; i++) { - if (atomic_load(n_barrier_passed) != passed_old) { - return; - } - #if defined(__SSE3__) - _mm_pause(); - #endif + if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { + return; } - sched_yield(); + ggml_thread_cpu_relax(); } } } @@ -10148,7 +10253,7 @@ static void ggml_compute_forward_acc_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } const int ith = params->ith; @@ -12622,10 +12727,10 @@ UseGgmlGemm1:; if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - atomic_store(¶ms->shared->current_chunk, nth); + atomic_store_explicit(¶ms->threadpool->current_chunk, nth, memory_order_relaxed); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { @@ -12733,7 +12838,7 @@ UseGgmlGemm2:; break; } - current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); + current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); } } @@ -12828,7 +12933,7 @@ static void ggml_compute_forward_mul_mat_id( } } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { @@ -12982,7 +13087,7 @@ static void ggml_compute_forward_out_prod_f32( if (ith == 0) { ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // dst[:,:,:,:] = 0 // for i2,i3: @@ -13100,7 +13205,7 @@ static void ggml_compute_forward_out_prod_q_f32( if (ith == 0) { ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // parallelize by last three dimensions @@ -13286,7 +13391,7 @@ static void ggml_compute_forward_set_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } const int ith = params->ith; @@ -13865,7 +13970,7 @@ static void ggml_compute_forward_diag_mask_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } // TODO: handle transposed/permuted matrices @@ -14641,7 +14746,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( // need to zero dst since we are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -14729,7 +14834,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( // need to zero dst since we are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -15109,7 +15214,7 @@ static void ggml_compute_forward_conv_transpose_2d( memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t stride = ggml_get_op_params_i32(dst, 0); @@ -15977,7 +16082,7 @@ static void ggml_compute_forward_flash_attn_back_f32( if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int64_t elem_q = ggml_nelements(q); const int64_t elem_k = ggml_nelements(k); @@ -16668,7 +16773,7 @@ static void ggml_compute_forward_add_rel_pos_f32( if (params->ith == 0) { memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 @@ -16953,7 +17058,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -16994,7 +17099,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( } #endif } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); if (ith == 0) { float * dp = (float *) dst->data; @@ -18810,65 +18915,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -19149,9 +19195,268 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } -struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +bool ggml_thread_apply_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_MAX_N_THREADS >= 64); + + for (int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)bitmask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + // Note that on Windows the Process Priority Class must be updated in order to set Thread priority. + // This is up to the applications. + DWORD p = THREAD_PRIORITY_NORMAL; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + if (!SetThreadPriority(GetCurrentThread(), p)) { + fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + + return true; +} + +#elif defined(__APPLE__) +#include +#include + +static bool ggml_thread_apply_affinity(const bool * mask) { + // Not supported on Apple platforms + UNUSED(mask); + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); + return false; + } + + return true; +} + +#else // posix? + +static bool ggml_thread_apply_affinity(const bool * mask) { + cpu_set_t cpuset; + int err; + + CPU_ZERO(&cpuset); + + for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + CPU_SET(i, &cpuset); + } + } + +#ifdef __ANDROID__ + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + err = errno; + } +#else + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); +#endif + if (err != 0) { + fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err); + return false; + } + + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); + return false; + } + + return true; +} + +#endif + +static bool ggml_thread_cpumask_is_valid(const bool * mask) { + for (int i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { return true; } + } + return false; +} + +static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { + if (!strict) { + memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); + return; + } else { + memset(local_mask, 0, GGML_MAX_N_THREADS); + int32_t base_idx = *iter; + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + int32_t idx = base_idx + i; + if (idx >= GGML_MAX_N_THREADS) { + // Just a cheaper modulo + idx -= GGML_MAX_N_THREADS; + } + if (global_mask[idx]) { + local_mask[idx] = 1; + *iter = idx + 1; + return; + } + } + } +} + +void ggml_threadpool_free(struct ggml_threadpool* threadpool) { + if (!threadpool) return; + +#ifndef GGML_USE_OPENMP + struct ggml_compute_state* workers = threadpool->workers; + const int n_threads = threadpool->n_threads_max; + + ggml_mutex_lock(&threadpool->mutex); + + threadpool->stop = true; + threadpool->pause = false; + + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + + for (int j = 1; j < n_threads; j++) { + int32_t rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); + UNUSED(rc); + } + + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); +#endif // GGML_USE_OPENMP + + GGML_ALIGNED_FREE(threadpool->workers); + GGML_ALIGNED_FREE(threadpool); +} + +#ifndef GGML_USE_OPENMP +// pause/resume must be called under mutex +static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) { + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); +} + +static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) { + GGML_PRINT_DEBUG("Resuming threadpool\n"); + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); +} +#endif + +void ggml_threadpool_pause(struct ggml_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + ggml_mutex_lock(&threadpool->mutex); + if (!threadpool->pause) { + ggml_threadpool_pause_locked(threadpool); + } + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + ggml_mutex_lock(&threadpool->mutex); + if (threadpool->pause) { + ggml_threadpool_resume_locked(threadpool); + } + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_threadpool * threadpool) { + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); + } if (n_threads <= 0) { - n_threads = GGML_DEFAULT_N_THREADS; + n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; } size_t work_size = 0; @@ -19307,12 +19612,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } if (work_size > 0) { - work_size += CACHE_LINE_SIZE*(n_threads - 1); + work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cplan.threadpool = threadpool; + cplan.n_threads = MIN(max_tasks, n_threads); + cplan.work_size = work_size; + cplan.work_data = NULL; return cplan; } @@ -19320,17 +19626,17 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - const struct ggml_cgraph * cgraph = state->shared->cgraph; - const struct ggml_cplan * cplan = state->shared->cplan; + const struct ggml_cgraph * cgraph = state->threadpool->cgraph; + const struct ggml_cplan * cplan = state->threadpool->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->shared->n_threads, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.shared=*/ state->shared, + /*.ith =*/ state->ith, + /*.nth =*/ state->threadpool->n_threads_cur, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ state->threadpool, }; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -19339,12 +19645,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->ec = GGML_STATUS_ABORTED; + state->threadpool->ec = GGML_STATUS_ABORTED; } - ggml_barrier(state->shared); + ggml_barrier(state->threadpool); - if (state->shared->ec != GGML_STATUS_SUCCESS) { + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { break; } } @@ -19352,24 +19658,243 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } +#ifndef GGML_USE_OPENMP + +static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + if (state->pending || threadpool->stop || threadpool->pause) { return true; } + + // check for new graph/work + int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); + if (new_graph != state->last_graph) { + state->pending = (state->ith < threadpool->n_threads_cur); + state->last_graph = new_graph; + } + + return state->pending; +} + +static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + // This seems to make 0 ... 100 a decent range for polling level across modern processors. + // Perhaps, we can adjust it dynamically based on load and things. + const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; + + for (uint64_t i=0; !ggml_graph_compute_ready(state) && ipending; +} + +static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + if (ggml_graph_compute_poll_for_work(state)) { + return state->pending; + } + + ggml_mutex_lock_shared(&threadpool->mutex); + while (!ggml_graph_compute_ready(state)) { + // No new work. Wait for the signal. + GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith); + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + + return state->pending; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * threadpool = state->threadpool; + + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(state->cpumask)) { + ggml_thread_apply_affinity(state->cpumask); + } + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // The main thread is the only one that can dispatch new work + + ggml_graph_compute_check_for_work(state); + if (state->pending) { + state->pending = false; + + ggml_graph_compute_thread(state); + } + } + + return (thread_ret_t) 0; +} + +// Start processing new graph +static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool) +{ + // always take the mutex here because the worker threads are doing hybrid poll/wait + + ggml_mutex_lock(&threadpool->mutex); + + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + + if (threadpool->pause) { + // Update main thread prio and affinity to match the threadpool settings + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + + // resume does cond broadcast + ggml_threadpool_resume_locked(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); + } + + ggml_mutex_unlock(&threadpool->mutex); +} + +#endif // GGML_USE_OPENMP + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} + +static struct ggml_threadpool * ggml_threadpool_new_impl( + struct ggml_threadpool_params * tpp, + struct ggml_cgraph * cgraph, + struct ggml_cplan * cplan) { + + struct ggml_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); + { + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_graph = 0; + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->stop = false; + threadpool->pause = tpp->paused; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = tpp->n_threads; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + threadpool->ec = GGML_STATUS_SUCCESS; + } + + // Allocate and init workers state + const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; + struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size); + + memset(workers, 0, workers_size); + for (int j = 0; j < tpp->n_threads; j++) { + workers[j].threadpool = threadpool; + workers[j].ith = j; + } + + threadpool->workers = workers; + +#ifndef GGML_USE_OPENMP + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + + // Spin the threads for all workers, and update CPU placements. + // Place the main thread last (towards the higher numbered CPU cores). + + int32_t cpumask_iter = 0; + + for (int j = 1; j < tpp->n_threads; j++) { + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + + int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]); + GGML_ASSERT(rc == 0); + } + + ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter); + + if (!threadpool->pause) { + // Update main thread prio and affinity at the start, otherwise we'll do it in resume + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + } +#endif // GGML_USE_OPENMP + + return threadpool; +} + +struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { + return ggml_threadpool_new_impl(tpp, NULL, NULL); +} + enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - int n_threads = cplan->n_threads; + int n_threads = cplan->n_threads; + struct ggml_threadpool * threadpool = cplan->threadpool; - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.n_threads =*/ n_threads, - /*.n_barrier =*/ 0, - /*.n_barrier_passed =*/ 0, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, - /*.current_chunk =*/ 0, - /*.ec =*/ GGML_STATUS_SUCCESS, - }; + bool disposable_threadpool = false; + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); + disposable_threadpool = true; + + struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); + threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); + } else { + // Reset some of the parameters that need resetting + // No worker threads should be accessing the parameters below at this stage + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; + } + + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); + } #ifdef GGML_USE_OPENMP if (n_threads > 1) { @@ -19379,63 +19904,36 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl { // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); - state_shared.n_threads = n_threads; + threadpool->n_threads_cur = n_threads; } - struct ggml_compute_state worker = { - .thrd = 0, - .ith = omp_get_thread_num(), - .shared = &state_shared, - }; - ggml_graph_compute_thread(&worker); + ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); } } else { - struct ggml_compute_state worker = { - .thrd = 0, - .ith = 0, - .shared = &state_shared, - }; - ggml_graph_compute_thread(&worker); + ggml_graph_compute_thread(&threadpool->workers[0]); } #else - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // Kick all threads to start the new graph + ggml_graph_compute_kickoff(threadpool); - for (int j = 0; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; - } - - // create thread pool - for (int j = 1; j < n_threads; ++j) { - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); - - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } + // This is a work thread too + ggml_graph_compute_thread(&threadpool->workers[0]); #endif // don't leave affinity set on the main thread clear_numa_thread_affinity(); - return state_shared.ec; + enum ggml_status ret = threadpool->ec; + + if (disposable_threadpool) { + ggml_threadpool_free(threadpool); + } + + return ret; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); @@ -20251,7 +20749,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -20598,7 +21096,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; diff --git a/include/llama.h b/include/llama.h index 6cca6320b..c3bda9e02 100644 --- a/include/llama.h +++ b/include/llama.h @@ -304,8 +304,8 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + int32_t n_threads; // number of threads to use for generation + int32_t n_threads_batch; // number of threads to use for batch processing enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id @@ -428,6 +428,13 @@ extern "C" { //optional: LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); + // Optional: an auto threadpool gets created in ggml if not passed explicitly + LLAMA_API void llama_attach_threadpool( + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch); + LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); + // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); @@ -837,13 +844,13 @@ extern "C" { // Set the number of threads used for decoding // n_threads is the number of threads used for generation (single token) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) - LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); + LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch); // Get the number of threads used for generation of a single token. - LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); + LLAMA_API int32_t llama_n_threads(struct llama_context * ctx); // Get the number of threads used for prompt and batch processing (multiple token). - LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); + LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx); // Set whether the model is in embeddings mode or not // If true, embeddings will be returned but logits will not diff --git a/src/llama.cpp b/src/llama.cpp index 8d5f24783..2274296b4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2373,8 +2373,8 @@ struct llama_cparams { uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + int n_threads; // number of threads to use for generation + int n_threads_batch; // number of threads to use for batch processing float rope_freq_base; float rope_freq_scale; @@ -3091,6 +3091,9 @@ struct llama_context { #endif ggml_backend_t backend_cpu = nullptr; + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + bool has_evaluated_once = false; int64_t t_start_us; @@ -15494,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) { } static void llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads) { + llama_context & lctx, + ggml_cgraph * gf, + int n_threads, + ggml_threadpool * threadpool) { #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -15505,6 +15509,7 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } #ifdef GGML_USE_BLAS @@ -15625,6 +15630,8 @@ static int llama_decode_internal( } int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; + GGML_ASSERT(n_threads > 0); // non-causal masks do not use the KV cache @@ -15686,7 +15693,7 @@ static int llama_decode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // update the kv ring buffer { @@ -15863,7 +15870,9 @@ static int llama_encode_internal( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; + GGML_ASSERT(n_threads > 0); ggml_backend_sched_reset(lctx.sched); @@ -15895,7 +15904,7 @@ static int llama_encode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // extract embeddings if (embd) { @@ -16177,7 +16186,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); #endif //const int64_t t_end = ggml_time_us(); @@ -16203,7 +16212,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -17451,6 +17460,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) { } } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->threadpool = threadpool; + ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; +} + +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; + ctx->threadpool_batch = nullptr; +} + void llama_backend_free(void) { ggml_quantize_free(); } @@ -19367,16 +19389,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } -void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { +void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { ctx->cparams.n_threads = n_threads; ctx->cparams.n_threads_batch = n_threads_batch; } -uint32_t llama_n_threads(struct llama_context * ctx) { +int32_t llama_n_threads(struct llama_context * ctx) { return ctx->cparams.n_threads; } -uint32_t llama_n_threads_batch(struct llama_context * ctx) { +int32_t llama_n_threads_batch(struct llama_context * ctx) { return ctx->cparams.n_threads_batch; } diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 8159e276a..246bb227d 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32( } static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size);