mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
* hot-swap tensor loop: the hot-swap functionality is only triggered when certain env. variables are declared * target_include_directories tweak * hot-swap tensor support: two intrusions: 1.) at model loading, to collect the snapshot 2.) the modification of the `/health` HTTP endpoint, to be able to trigger the hot-swap by sending the `llama-server` an HTTP request. Both are guarded by specific env. variables * hot-swap tensor support; graph invalidation: ggml_backend_cuda_invalidate_graphs export * hot-swap tensor support: graph invalidation implementation; extended debug output (commented out) * llama_reload_changed_tensors export * tensor hot-swap: on-demand reload, cpu-only/hybrid/gpu-only, with split mode layer/graph; full support implementation * docs * reuse the gguf parsing from llama.cpp: gguf_init_from_file, gguf_find_tensor, ggml_get_tensor * remove the manual scheduling for hybrid inference * update docs * tensor shape validation * update docs * update docs: accidentally wiped the previous changes, so recovered them * revert the GGML_CUDA_MAX_DEVICES to 16 * update llama_reload_changed_tensor: update llama_reload_changed_tensor, revert CMakeLists.txt * update llama_reload_changed_tensor * GGML_MAX_SRC: GGML_MAX_SRC compile-time definition support * GGML_MAX_SRC: GGML_MAX_SRC compile-time definition support * GGML_MAX_SRC: GGML_MAX_SRC compile-time definition support * llama_reload_changed_tensor: update llama_reload_changed_tensor definition * refactor: move the tensor-reloading implementation to llama-reload.cpp, llama-reload-info.h; some bugfixes and code reduction * revert: added back the missing newline * update docs * reload_info constructor * bugfix: cpu-only. TODO: improve the working environment by compiling for multiple hardware configurations; possibly make a test pipeline * cpu-only bugfix: set the fix again after unsuccessful sync with main * windows os compilation fix: #include <string> * fix windows os build error C2039: 'string': is not a member
of 'std' * remove dead file * implement perplexity in server * Revert "implement perplexity in server"
73 lines
1.5 KiB
C++
73 lines
1.5 KiB
C++
#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

// Forward declarations so the container aliases below can be named before the
// full struct definitions that follow in this header.
struct llama_file;
struct llama_mmap;
struct llama_mlock;

// Owning collections: each element is a std::unique_ptr, so the vector holds
// exclusive ownership of every object stored in it.
using llama_files  = std::vector<std::unique_ptr<llama_file>>;
using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

// File handle opened from a path and a mode string. Only the interface is
// declared here; the state and all member definitions live out of line behind
// the private `impl` (pimpl idiom).
// NOTE(review): the per-member notes below are inferred from names and
// signatures only - confirm them against the implementation file.
struct llama_file {
    // `fname` / `mode` look like fopen()-style arguments - TODO confirm.
    llama_file(const char * fname, const char * mode);
    ~llama_file();

    size_t tell() const; // presumably the current offset within the file
    size_t size() const; // presumably the total file size in bytes

    int file_id() const; // fileno overload

    // `whence` presumably takes the SEEK_SET / SEEK_CUR / SEEK_END constants - TODO confirm.
    void seek(size_t offset, int whence) const;

    // Raw transfer of `len` bytes to/from `ptr`, plus 32-bit convenience helpers.
    // All I/O members are declared const, including the write_* ones.
    void read_raw(void * ptr, size_t len) const;
    uint32_t read_u32() const;

    void write_raw(const void * ptr, size_t len) const;
    void write_u32(uint32_t val) const;

    // Returns a reference to a stored path string; presumably the `fname`
    // given to the constructor - verify.
    const std::string & get_path() const;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

// Memory-mapping wrapper constructed from a llama_file. Only the interface is
// declared here; the state and all member definitions live out of line behind
// the private `impl` (pimpl idiom).
// NOTE(review): the per-member notes below are inferred from names and
// signatures only - confirm them against the implementation file.
struct llama_mmap {
    // Copy construction is explicitly disabled.
    llama_mmap(const llama_mmap &) = delete;

    // Defaults: prefetch = (size_t) -1 (the largest size_t value), numa = false,
    // use_thp = false.
    // presumably `prefetch` is a byte count and the default means "everything";
    // `use_thp` presumably refers to transparent huge pages - TODO confirm.
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool use_thp = false);
    ~llama_mmap();

    size_t size() const; // presumably the size of the mapping in bytes
    void * addr() const; // presumably the base address of the mapping

    // NOTE(review): `first` / `last` look like byte offsets delimiting a range
    // inside the mapping - confirm units and inclusivity with the implementation.
    void dontneed_fragment(size_t first, size_t last);

    void unmap_fragment(size_t first, size_t last);

    // Declared here, defined out of line; presumably reports platform support.
    static const bool SUPPORTED;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

// Memory-locking helper. Only the interface is declared here; the state and
// all member definitions live out of line behind the private `impl`
// (pimpl idiom).
// NOTE(review): the per-member notes below are inferred from names and
// signatures only - confirm them against the implementation file.
struct llama_mlock {
    llama_mlock();
    ~llama_mlock();

    // presumably records the base address of the region to be locked - TODO confirm.
    void init(void * ptr);
    // presumably extends the locked region up to `target_size` bytes - TODO confirm.
    void grow_to(size_t target_size);

    // Declared here, defined out of line; presumably reports platform support.
    static const bool SUPPORTED;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

// Declared here, defined elsewhere.
// NOTE(review): presumably returns the platform's maximum path length
// (PATH_MAX-like) - confirm against the definition.
size_t llama_path_max();