From 02182fc5b9b0d20c8a4fc15a6b8637dd65d3b537 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Jun 2026 15:57:05 +0300 Subject: [PATCH] fit : avoid including llama-ext.h in fit.h (#24506) --- common/fit.h | 48 +++++++++++++++++++++++++----------------------- src/llama-ext.h | 1 + 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/common/fit.h b/common/fit.h index 643d342009..a6ba71b89e 100644 --- a/common/fit.h +++ b/common/fit.h @@ -1,9 +1,7 @@ #pragma once #include "ggml.h" -#include "ggml-backend.h" #include "llama.h" -#include "../src/llama-ext.h" #include <vector> @@ -18,31 +16,35 @@ enum common_params_fit_status { // - this function is NOT thread safe because it modifies the global llama logger state // - only parameters that have the same value as in llama_default_model_params are modified // with the exception of the context size which is modified if and only if equal to 0 -enum common_params_fit_status common_fit_params( - const char * path_model, - struct llama_model_params * mparams, - struct llama_context_params * cparams, - float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements - struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements - size_t * margins, // margins of memory to leave per device in bytes - uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use - enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log +common_params_fit_status common_fit_params( + const char * path_model, + llama_model_params * mparams, + llama_context_params * cparams, + float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements + llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements + size_t * 
margins, // margins of memory to leave per device in bytes + uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use + ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log // print estimated memory to stdout void common_fit_print( - const char * path_model, - struct llama_model_params * mparams, - struct llama_context_params * cparams); + const char * path_model, + llama_model_params * mparams, + llama_context_params * cparams); -void common_memory_breakdown_print(const struct llama_context * ctx); +void common_memory_breakdown_print(const llama_context * ctx); + +// TODO: convert this to common_device_memory_data that wraps llama_device_memory_data +// add API for accessing the internal `llama-ext.h` information +struct llama_device_memory_data; // Load a model + context with no_alloc and return the per-device memory breakdown. std::vector<llama_device_memory_data> common_get_device_memory_data( - const char * path_model, - const struct llama_model_params * mparams, - const struct llama_context_params * cparams, - std::vector<ggml_backend_dev_t> & devs, - uint32_t & hp_ngl, - uint32_t & hp_n_ctx_train, - uint32_t & hp_n_expert, - enum ggml_log_level log_level); + const char * path_model, + const llama_model_params * mparams, + const llama_context_params * cparams, + std::vector<ggml_backend_dev_t> & devs, + uint32_t & hp_ngl, + uint32_t & hp_n_ctx_train, + uint32_t & hp_n_expert, + ggml_log_level log_level); diff --git a/src/llama-ext.h b/src/llama-ext.h index b744af5286..8b5679b690 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -2,6 +2,7 @@ // this is a staging header for new llama.cpp API // breaking changes and C++ are allowed. everything here should be considered WIP +// try as much as possible to not include this header in the rest of the codebase #include "llama.h"