faster ggml_cuda_host_malloc (#1988)

2026-06-28 04:30:15 -05:00 · 2026-06-18 09:01:34 +01:00 · 2026-06-18 09:01:34 +01:00 · 21f918c185
commit 21f918c185
parent f5e5753c32
1 changed files with 77 additions and 1 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -77,6 +77,10 @@
 #include <vector>
 #include <sstream>

+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
 #define IK_PRINT_TIMING 0

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@ -1378,10 +1382,76 @@ GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_bu
    GGML_UNUSED(buffer);
 }

+#ifdef __linux__
 GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { // Linux variant: releases a host buffer whose memory is in buffer->context
-    CUDA_CHECK(cudaFreeHost(buffer->context));
+    CUDA_CHECK(cudaHostUnregister(buffer->context)); // undo the cudaHostRegister done by the Linux ggml_cuda_host_malloc before unmapping
+    munmap(buffer->context, buffer->size); // return value is ignored; assumes buffer->size equals the length passed to mmap - TODO confirm against the allocating caller
 }

+static void * ggml_cuda_host_malloc(size_t size) { // Linux variant: mmap + prefault + cudaHostRegister; returns the registered pointer, or nullptr if disabled or on failure
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { // the variable only has to be set (any value) to disable pinned host memory
+        return nullptr;
+    }
+
+    constexpr double k_warn_limit = 8.0; // size threshold, in GiB, above which the progress messages below are logged
+    double size_GiB = size/(1024.*1024.*1024.); // requested size converted to GiB (1024^3 bytes) as a double
+    auto tim1 = ggml_time_us(); // start timestamp for the elapsed-time report at the end (presumably microseconds, given the 1e-3 -> ms conversion below)
+    if (size_GiB > k_warn_limit) {
+        GGML_CUDA_LOG_INFO("\n\nAllocating %.2f GiB of pinned host memory, this may take a while.\n", size_GiB);
+        GGML_CUDA_LOG_INFO("Using pinned host memory improves PP performance by a significant margin.\n");
+        GGML_CUDA_LOG_INFO("But if it takes too long for your model and amount of patience, kill the process and run using\n\n");
+        GGML_CUDA_LOG_INFO("GGML_CUDA_NO_PINNED=1 your_command_goes_here\n");
+    }
+
+    void * ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); // private anonymous read/write mapping (no backing file: fd -1, offset 0)
+    if (ptr == MAP_FAILED) {
+        GGML_CUDA_LOG_WARN("%s: mmap of %.2f MiB failed\n", __func__, size/1024.0/1024.0);
+        return nullptr;
+    }
+
+// Disabled below (#if 0): whether to ask the kernel to try to defragment memory so the region is backed by 2M hugepages.
+// Without that request the outcome depends on the kernel's hugepage "enabled" setting:
+//   * enabled="always":  hands over whatever 2M pages it has on hand; the rest will be 4k
+//   * enabled="madvise": 4k pages
+//   * enabled="never":   4k pages
+// Performance is potluck: if there is not much defragmentation to do, you win; otherwise it can take a very long time.
+#if 0
+#ifdef MADV_HUGEPAGE
+    madvise(ptr, size, MADV_HUGEPAGE);
+#endif
+#endif
+
+    // Prefault the whole region. If the kernel headers provide MADV_POPULATE_WRITE, let the kernel do it.
+    // Might be worth spawning threads to speed up this process on huge allocations.
+    int needs_manual_prefault = 1; // non-zero means the pages must be touched by hand in the loop below
+#ifdef MADV_POPULATE_WRITE
+    needs_manual_prefault = madvise(ptr, size, MADV_POPULATE_WRITE); // keeps madvise's return value: a non-zero result falls back to the manual loop
+#endif
+    if (needs_manual_prefault)
+    {
+        char * p = (char *) ptr;
+        for (size_t off = 0; off < size; off += 4096) { // one write every 4096 bytes; presumably the 4k page size - NOTE(review): confirm for other page sizes
+            p[off] = 0;
+        }
+    }
+
+    cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterPortable); // register the mapped range with CUDA using the portable flag
+    if (err != cudaSuccess) {
+        cudaGetLastError(); // clear the error
+        GGML_CUDA_LOG_WARN("%s: cudaHostRegister of %.2f MiB failed: %s\n", __func__,
+                           size/1024.0/1024.0, cudaGetErrorString(err));
+        munmap(ptr, size); // registration failed: release the mapping so nothing leaks before returning nullptr
+        return nullptr;
+    }
+
+    if (size_GiB > k_warn_limit) { // report timing only for the large allocations announced above
+        auto tim2 = ggml_time_us();
+        GGML_CUDA_LOG_INFO("    done allocating %.2f GiB in %.1f ms\n\n", size_GiB, 1e-3*(tim2-tim1));
+    }
+    return ptr; // released by the Linux ggml_backend_cuda_host_buffer_free_buffer via cudaHostUnregister + munmap
+}
+
+#else // !__linux__
 static void * ggml_cuda_host_malloc(size_t size) {
    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
        return nullptr;
@ -1413,6 +1483,12 @@ static void * ggml_cuda_host_malloc(size_t size) {
    return ptr;
 }

+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { // non-Linux variant: releases a host buffer whose memory is in buffer->context
+    CUDA_CHECK(cudaFreeHost(buffer->context)); // presumably pairs with a CUDA host allocation in the non-Linux ggml_cuda_host_malloc - not fully visible here
+}
+
+#endif // __linux__
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * ptr = ggml_cuda_host_malloc(size);