From f5feac831fe225ed7f3db938d115732a49dccfc4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Nov 2023 17:19:35 +0200 Subject: [PATCH] llama : fix data units ggml-ci --- ggml-cuda.cu | 4 ++-- ggml-metal.m | 18 +++++++++--------- llama.cpp | 40 ++++++++++++++++++++-------------------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c0c9edd56..b4be38f7d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5841,7 +5841,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { } #ifdef DEBUG_CUDA_MALLOC fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024)); + (uint32_t)(max_size/1e6), (uint32_t)(tot_size/1e6), (uint32_t)(size/1e6)); #endif void * ptr; size_t look_ahead_size = (size_t) (1.05 * size); @@ -5979,7 +5979,7 @@ void * ggml_cuda_host_malloc(size_t size) { // This can fixed the OOM error in WSL. cudaGetLastError(); fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", - size/1024.0/1024.0, cudaGetErrorString(err)); + size/1e6, cudaGetErrorString(err)); return nullptr; } diff --git a/ggml-metal.m b/ggml-metal.m index 3d22b0b27..4fe9cc487 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { } GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); if (ctx->device.maxTransferRate != 0) { - GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); } else { GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } @@ -541,11 +541,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6); return false; } - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6); ++ctx->n_buffers; } else { @@ -565,11 +565,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6); return false; } - GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i); if (i + size_step < size) { GGML_METAL_LOG_INFO("\n"); } @@ -580,8 +580,8 @@ bool ggml_metal_add_buffer( #if TARGET_OS_OSX GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", - ctx->device.currentAllocatedSize / 1024.0 / 1024.0, - ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + ctx->device.currentAllocatedSize / 1e6, + ctx->device.recommendedMaxWorkingSetSize / 1e6); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); @@ -589,7 +589,7 @@ bool ggml_metal_add_buffer( GGML_METAL_LOG_INFO("\n"); } #else - GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6); #endif } diff --git a/llama.cpp b/llama.cpp index 92c4536cb..e24f23655 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1083,9 +1083,9 @@ enum e_model { MODEL_70B, }; -static const size_t kB = 1024; -static const size_t MB = 1024*kB; -static const size_t GB = 1024*MB; +static const size_t kB = 1000; +static const size_t MB = 1000*kB; +static const size_t GB = 1000*MB; struct llama_hparams { bool vocab_only; @@ -1481,7 +1481,7 @@ static bool llama_kv_cache_init( vram_kv_cache += ggml_nbytes(cache.k); } if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1e6); } } #endif @@ -2520,9 +2520,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); if (ml.n_bytes < GB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1e6, ml.n_bytes*8.0/ml.n_elements); } else { - LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1e9, ml.n_bytes*8.0/ml.n_elements); } // general kv @@ -2558,7 +2558,7 @@ static void llm_load_tensors( ml.calc_sizes(ctx_size, mmapped_size); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1e6); // create the ggml context { @@ -3207,7 +3207,7 @@ static void llm_load_tensors( ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory - LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1e6); #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -3226,7 +3226,7 @@ static void llm_load_tensors( #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1e6); #else (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) @@ -7878,7 +7878,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = tensor->type; new_data = tensor->data; new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1e6); } else { const size_t nelements = ggml_nelements(tensor); @@ -7938,7 +7938,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s workers.clear(); } - LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1e6, new_size/1e6); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -7976,8 +7976,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_free(ctx_out); - LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1e6); + LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1e6); // print histogram for all tensors { @@ -8478,7 +8478,7 @@ struct llama_context * llama_new_context_with_model( { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6); } // resized during inference @@ -8523,7 +8523,7 @@ struct llama_context * llama_new_context_with_model( // measure memory requirements for the graph size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1e6); // recreate allocator with exact memory requirements ggml_allocr_free(ctx->alloc); @@ -8537,7 +8537,7 @@ struct llama_context * llama_new_context_with_model( #endif #ifdef GGML_USE_CUBLAS ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1e6); // calculate total VRAM usage auto add_tensor = [](const ggml_tensor * t, size_t & size) { @@ -8558,9 +8558,9 @@ struct llama_context * llama_new_context_with_model( size_t total_vram_size = model_vram_size + ctx_vram_size; LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + total_vram_size / 1e6, + model_vram_size / 1e6, + ctx_vram_size / 1e6); #endif } @@ -8581,7 +8581,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1e6); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \