app : add batched-bench, fit-params, quantize & perplexity (#23459)

* app : add batched-bench, fit-params, quantize & perplexity

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Add missing main.cpp

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Add EOL

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2026-05-21 09:29:44 +02:00 committed by GitHub
parent 12e5d99078
commit 1d7ab2b947
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 118 additions and 20 deletions

View File

@ -3,7 +3,16 @@ set(TARGET llama-app)
add_executable(${TARGET} llama.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
target_link_libraries(${TARGET} PRIVATE llama-server-impl llama-cli-impl llama-completion-impl llama-bench-impl)
target_link_libraries(${TARGET} PRIVATE
llama-server-impl
llama-cli-impl
llama-completion-impl
llama-bench-impl
llama-batched-bench-impl
llama-fit-params-impl
llama-quantize-impl
llama-perplexity-impl
)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@ -4,12 +4,18 @@
#include <string>
#include <vector>
// visible
int llama_server(int argc, char ** argv);
int llama_cli(int argc, char ** argv);
// hidden
int llama_completion(int argc, char ** argv);
int llama_bench(int argc, char ** argv);
int llama_batched_bench(int argc, char ** argv);
int llama_fit_params(int argc, char ** argv);
int llama_quantize(int argc, char ** argv);
int llama_perplexity(int argc, char ** argv);
static int help(int argc, char ** argv);
static int version(int argc, char ** argv);
@ -22,12 +28,16 @@ struct command {
};
// Table of subcommands. Each row supplies, in order: the command name, a
// one-line description, a brace-list of alternate names, a boolean flag, and
// the function that implements the command.
// NOTE(review): `struct command` is declared outside this view, so the field
// meanings above are read off the row values; the boolean presumably marks
// the command as hidden (cf. the "visible"/"hidden" grouping of the
// declarations) -- confirm against the struct definition.
// NOTE(review): the rows for serve, cli, completion, bench, version and help
// appear twice below because this view shows a diff hunk's removed and added
// lines back to back.
static const command cmds[] = {
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmarking tool", {}, true, llama_bench },
{"version", "Show version", {}, true, version },
{"help", "Show available commands", {}, true, help },
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, true, version },
{"help", "Show available commands", {}, true, help },
};
static int version(int argc, char ** argv) {

View File

@ -1,6 +1,18 @@
# llama-batched-bench-impl: batched-bench logic, reusable by app
set(TARGET llama-batched-bench-impl)
add_library(${TARGET} STATIC batched-bench.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
# llama-batched-bench executable
set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE llama-batched-bench-impl)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@ -15,7 +15,10 @@ static void print_usage(int, char ** argv) {
LOG("\n");
}
int main(int argc, char ** argv) {
// satisfies -Wmissing-declarations
int llama_batched_bench(int argc, char ** argv);
int llama_batched_bench(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;

View File

@ -0,0 +1,5 @@
// Declared here; the definition lives in a separate translation unit.
int llama_batched_bench(int argc, char ** argv);

// Program entry point: passes the command line through untouched and uses
// whatever llama_batched_bench() returns as the process exit status.
int main(int argc, char ** argv) {
    const int exit_code = llama_batched_bench(argc, argv);
    return exit_code;
}

View File

@ -1,6 +1,18 @@
# llama-fit-params-impl: fit-params logic, reusable by app
set(TARGET llama-fit-params-impl)
add_library(${TARGET} STATIC fit-params.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
# llama-fit-params executable
set(TARGET llama-fit-params)
add_executable(${TARGET} fit-params.cpp)
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE llama-fit-params-impl)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@ -12,7 +12,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int main(int argc, char ** argv) {
// satisfies -Wmissing-declarations
int llama_fit_params(int argc, char ** argv);
int llama_fit_params(int argc, char ** argv) {
common_params params;
common_init();

View File

@ -0,0 +1,5 @@
// Declared here; the definition lives in a separate translation unit.
int llama_fit_params(int argc, char ** argv);

// Program entry point: passes the command line through untouched and uses
// whatever llama_fit_params() returns as the process exit status.
int main(int argc, char ** argv) {
    const int exit_code = llama_fit_params(argc, argv);
    return exit_code;
}

View File

@ -1,6 +1,18 @@
# llama-perplexity-impl: perplexity logic, reusable by app
set(TARGET llama-perplexity-impl)
add_library(${TARGET} STATIC perplexity.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
# llama-perplexity executable
set(TARGET llama-perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE llama-perplexity-impl)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

View File

@ -0,0 +1,5 @@
// Declared here; the definition lives in a separate translation unit.
int llama_perplexity(int argc, char ** argv);

// Program entry point: passes the command line through untouched and uses
// whatever llama_perplexity() returns as the process exit status.
int main(int argc, char ** argv) {
    const int exit_code = llama_perplexity(argc, argv);
    return exit_code;
}

View File

@ -2005,7 +2005,10 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
}
int main(int argc, char ** argv) {
// satisfies -Wmissing-declarations
int llama_perplexity(int argc, char ** argv);
int llama_perplexity(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;

View File

@ -1,7 +1,18 @@
# llama-quantize-impl: quantize logic, reusable by app
set(TARGET llama-quantize-impl)
add_library(${TARGET} STATIC quantize.cpp)
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
# llama-quantize executable
set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE llama-quantize-impl)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)

5
tools/quantize/main.cpp Normal file
View File

@ -0,0 +1,5 @@
// Declared here; the definition lives in a separate translation unit.
int llama_quantize(int argc, char ** argv);

// Program entry point: passes the command line through untouched and uses
// whatever llama_quantize() returns as the process exit status.
int main(int argc, char ** argv) {
    const int exit_code = llama_quantize(argc, argv);
    return exit_code;
}

View File

@ -490,7 +490,10 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
return true;
}
int main(int argc, char ** argv) {
// satisfies -Wmissing-declarations
int llama_quantize(int argc, char ** argv);
int llama_quantize(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
usage(argv[0]);