llama : fix platforms without mmap (#4578)

* llama : fix platforms without mmap
* win32 : limit prefetch size to the file size
* fix win32 error clobber, unnecessary std::string in std::runtime_error
2025-01-12 03:31:46 +00:00 · 2023-12-22 12:12:53 +01:00 · 2023-12-22 12:12:53 +01:00 · 48b7ff193e
commit 48b7ff193e
parent 48b24b170e
3 changed files with 24 additions and 21 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7702,7 +7702,8 @@ inline void ggml_cuda_op_scale(
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const float scale = ((float *) dst->op_params)[0];
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());
--- a/ggml.c
+++ b/ggml.c
@@ -10335,7 +10335,8 @@ static void ggml_compute_forward_scale_f32(
    }

    // scale factor
-    const float v = *(float *) dst->op_params;
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));

    const int ith = params->ith;
    const int nth = params->nth;
@@ -15152,7 +15153,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                // necessary for llama
                if (src0->grad) {
-                    const float s = ((float *) tensor->op_params)[0];
+                    float s;
+                    memcpy(&s, tensor->op_params, sizeof(float));

                    src0->grad =
                        ggml_add_or_set(ctx,
--- a/llama.cpp
+++ b/llama.cpp
@@ -778,7 +778,7 @@ struct llama_file {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+            throw std::runtime_error("unexpectedly reached end of file");
        }
    }

@@ -931,29 +931,29 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+        GGML_UNUSED(numa);

        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();

        if (hMapping == NULL) {
+            DWORD error = GetLastError();
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
+        DWORD error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

-        if (prefetch) {
+        if (prefetch > 0) {
            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -965,9 +965,9 @@ struct llama_mmap {
                // advise the kernel to preload the mapped memory
                WIN32_MEMORY_RANGE_ENTRY range;
                range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T)size;
+                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
@@ -982,26 +982,26 @@ struct llama_mmap {

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
 #else
    static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) file;
-        (void) prefetch;
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);

-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
    }

-    void unmap(size_t offset, size_t len) {
-        (void) offset;
-        (void) len;
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);

-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
    }
 #endif
 };