From 02182fc5b9b0d20c8a4fc15a6b8637dd65d3b537 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 12 Jun 2026 15:57:05 +0300
Subject: [PATCH] fit : avoid including llama-ext.h in fit.h (#24506)

---
 common/fit.h    | 48 +++++++++++++++++++++++++-----------------------
 src/llama-ext.h |  1 +
 2 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/common/fit.h b/common/fit.h
index 643d342009..a6ba71b89e 100644
--- a/common/fit.h
+++ b/common/fit.h
@@ -1,9 +1,7 @@
 #pragma once
 
 #include "ggml.h"
-#include "ggml-backend.h"
 #include "llama.h"
-#include "../src/llama-ext.h"
 
 #include <vector>
 
@@ -18,31 +16,35 @@ enum common_params_fit_status {
 //   - this function is NOT thread safe because it modifies the global llama logger state
 //   - only parameters that have the same value as in llama_default_model_params are modified
 //     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+common_params_fit_status common_fit_params(
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams,
+                              float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+   llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                             size_t * margins,               // margins of memory to leave per device in bytes
+                           uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                     ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
 
 // print estimated memory to stdout
 void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
+                         const char * path_model,
+                 llama_model_params * mparams,
+               llama_context_params * cparams);
 
-void common_memory_breakdown_print(const struct llama_context * ctx);
+void common_memory_breakdown_print(const llama_context * ctx);
+
+// TODO: convert this to common_device_memory_data that wraps llama_device_memory_data
+//       add API for accessing the internal `llama-ext.h` information
+struct llama_device_memory_data;
 
 // Load a model + context with no_alloc and return the per-device memory breakdown.
 std::vector<llama_device_memory_data> common_get_device_memory_data(
-                                  const char   * path_model,
-        const struct llama_model_params         * mparams,
-        const struct llama_context_params       * cparams,
-        std::vector<ggml_backend_dev_t>         & devs,
-                                      uint32_t  & hp_ngl,
-                                      uint32_t  & hp_n_ctx_train,
-                                      uint32_t  & hp_n_expert,
-                           enum ggml_log_level    log_level);
+                         const char * path_model,
+           const llama_model_params * mparams,
+         const llama_context_params * cparams,
+    std::vector<ggml_backend_dev_t> & devs,
+                           uint32_t & hp_ngl,
+                           uint32_t & hp_n_ctx_train,
+                           uint32_t & hp_n_expert,
+                     ggml_log_level   log_level);
diff --git a/src/llama-ext.h b/src/llama-ext.h
index b744af5286..8b5679b690 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -2,6 +2,7 @@
 
 // this is a staging header for new llama.cpp API
 // breaking changes and C++ are allowed. everything here should be considered WIP
+// try as much as possible to not include this header in the rest of the codebase
 
 #include "llama.h"