mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Update mtmd to improve accuracy of M-RoPE (#993)
* model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206) * feat: Add granite-docling conversion using trillion pretokenizer Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add granite-docling vocab pre enum Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use granite-docling pre Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add clip_is_idefics3 Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Allow multi-token boundary sequences for image templating Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add tiling support for idefics3 in clip.cpp This should likely be moved into llava_uhd::get_slice_instructions, but for now this avoids disrupting the logic there. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Partial support for full templating for idefics3 in mtmd There are still errors encoding some of the image chunks, but the token sequence now matches transformers _almost_ perfectly, except for the double newline before the global image which shows up as two consecutive newline tokens instead of a single double-newline token. I think this is happening because the blocks are tokenized separately then concatenated. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Fully working image preprocessing for idefics3 w/ resize and slicing Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Parse the preprocessor config's longest side and add it to the mmproj hparams Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use the longest side instead of size * scale_factor For Granite Docling, these come out to the same value, but that was just a coincidence. 
Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Allow batch encoding and remove clip_is_idefics3 Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Remove unnecessary conditionals for empty token vectors Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Use image_manipulation util Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * add test model --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co> # Conflicts: # convert_hf_to_gguf.py # convert_hf_to_gguf_update.py # gguf-py/gguf/constants.py # gguf-py/gguf/gguf_writer.py # src/llama-vocab.cpp # src/llama-vocab.h * mtmd : support home-cooked Mistral Small Omni (#14928) * model : add LightOnOCR-1B model (#16764) * model : add LightOnOCR-1B model * add test # Conflicts: # convert_hf_to_gguf.py # gguf-py/gguf/constants.py * mtmd : fix idefics3 preprocessing (#16806) * mtmd : fix idefics3 preprocessing * disable granite test * fix test for granite * model: Add support for CogVLM model (#15002) * Added GGUF mappings for CogVLM model * Add tensor mapping for CogVLM visual encoder * Add CogVLM to conversion script, no vision part yet * Added CogVLM vision model to conversion script * Add graph for CogVLM CLIP model * Add graph for CogVLM * Fixes for CogVLM. Now compiles. 
* Model now runs * Fixes for cogvlm graph * Account for graph context change after rebase * Changes for whitespace * Changes in convert script according to comments * Switch CogVLM LLM graph to merged QKV tensor * Use rope_type variable instead of direct definition * Change CogVLM CLIP encoder to use SWIGLU * Switch CogVLM CLIP to use merged QKV * Apply rebase edits and remove ggml_cont call that is now unnecessary * clean up --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> # Conflicts: # convert_hf_to_gguf.py # examples/mtmd/clip.cpp # gguf-py/gguf/constants.py # gguf-py/gguf/tensor_mapping.py # src/llama-arch.cpp # src/llama-arch.h # src/llama-model.cpp # src/llama-model.h * mtmd: refactor preprocessing + support max/min pixels (#16878) * mtmd: refactor preprocessing + support max/min pixels * fix mlp type * implement min/max pixels * improve hparams * better image preproc for qwen * fix * fix out of bound composite * fix (2) * fix token calculation * get_merge_kernel_size() * fix llama4 and lfm2 * gonna fix them all * use simple resize for qwen * qwen: increase min tokens * no resize if dst size == src size * restore to initial min/max tokens value for qwen # Conflicts: # examples/mtmd/clip.cpp * clip : use FA (#16837) * clip : use FA * cont : add warning about unsupported ops * implement "auto" mode for clip flash attn * clip : print more detailed op support info during warmup * cont : remove obsolete comment [no ci] * improve debugging message * trailing space * metal : remove stray return --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> * model: add Janus Pro for image understanding (#16906) * Add support for Janus Pro * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Address reviewer suggestions Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Add JANUS_PRO 
constant * Update clip model handling Co-authored-by: Xuan-Son Nguyen <son@huggingface.co> * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * Refactor JANUS_PRO handling in clip.cpp Co-authored-by: Xuan-Son Nguyen <son@huggingface.co> * Update tools/mtmd/clip.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * em whitespace --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Xuan-Son Nguyen <son@huggingface.co> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> # Conflicts: # convert_hf_to_gguf.py # gguf-py/gguf/constants.py # gguf-py/gguf/tensor_mapping.py * mtmd: pad mask for qwen2.5vl (#16954) * mtmd: pad mask for qwen2.5vl * improve * mtmd: add --image-min/max-tokens (#16921) * mtmd: improve struct initialization (#16981) * mtmd: allow QwenVL to process larger image by default (#17020) * Disable flash attention * mtmd : fix embedding size for image input (#17123) * mtmd: fix patch_size initialized to random value in audio models (#17128) * mtmd: fix patch_size initialized to random value in audio models * add default hparams * add llama_model_n_embd_inp * Fix load qwen3 vl Change batch size * Add description * Fix cli build error --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: Tianyue-Zhao <zhaotianyue@outlook.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Zhiyong Wang <85110830+ravenouse@users.noreply.github.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
parent
d24ea9e48e
commit
869557c8fd
@ -1047,11 +1047,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
params.mmproj_use_gpu = false;
|
params.mmproj_use_gpu = false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "--image") {
|
if (arg == "--image" || arg == "--audio") {
|
||||||
CHECK_ARG
|
CHECK_ARG
|
||||||
params.image.emplace_back(argv[i]);
|
params.image.emplace_back(argv[i]);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (arg == "--image-min-tokens") {
|
||||||
|
CHECK_ARG
|
||||||
|
params.image_min_tokens = std::stoi(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--image-max-tokens") {
|
||||||
|
CHECK_ARG
|
||||||
|
params.image_max_tokens = std::stoi(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (arg == "-i" || arg == "--interactive") {
|
if (arg == "-i" || arg == "--interactive") {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
return true;
|
return true;
|
||||||
@ -2190,6 +2200,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
options.push_back({ "multi-modality" });
|
options.push_back({ "multi-modality" });
|
||||||
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
||||||
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
||||||
|
options.push_back({ "*", " --image-min-tokens N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)"});
|
||||||
|
options.push_back({ "*", " --image-max-tokens N", "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)" });
|
||||||
options.push_back({ "*", " --no-context-shift", "disable context-shift." });
|
options.push_back({ "*", " --no-context-shift", "disable context-shift." });
|
||||||
options.push_back({ "*", "--context-shift (auto|on|off|0|1)", "set context-shift (default: %s)", params.ctx_shift ? "on" : "off" });
|
options.push_back({ "*", "--context-shift (auto|on|off|0|1)", "set context-shift (default: %s)", params.ctx_shift ? "on" : "off" });
|
||||||
options.push_back({ "backend" });
|
options.push_back({ "backend" });
|
||||||
@ -2992,11 +3004,20 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|||||||
|
|
||||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
|
int n_batch = params.n_batch;
|
||||||
|
int n_ubatch = params.n_ubatch;
|
||||||
|
|
||||||
|
// temporary fix for qwen mtmd
|
||||||
|
if (!params.mmproj.path.empty()) {
|
||||||
|
n_batch = std::max(params.n_batch, params.n_ubatch);
|
||||||
|
n_ubatch = params.n_batch;
|
||||||
|
fprintf(stdout, "Adjust batch size for mtmd: u_batch = %d, batch = %d\n", n_ubatch, n_batch);
|
||||||
|
}
|
||||||
|
|
||||||
cparams.n_ctx = params.n_ctx;
|
cparams.n_ctx = params.n_ctx;
|
||||||
cparams.n_seq_max = params.n_parallel;
|
cparams.n_seq_max = params.n_parallel;
|
||||||
cparams.n_batch = params.n_batch;
|
cparams.n_batch = n_batch;
|
||||||
cparams.n_ubatch = params.n_ubatch;
|
cparams.n_ubatch = n_ubatch;
|
||||||
cparams.n_threads = params.n_threads;
|
cparams.n_threads = params.n_threads;
|
||||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
cparams.seed = params.seed;
|
cparams.seed = params.seed;
|
||||||
|
|||||||
@ -287,6 +287,8 @@ struct gpt_params {
|
|||||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||||
bool no_mmproj = false; // explicitly disable multimodal model
|
bool no_mmproj = false; // explicitly disable multimodal model
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
int image_min_tokens = -1;
|
||||||
|
int image_max_tokens = -1;
|
||||||
|
|
||||||
// embedding
|
// embedding
|
||||||
bool embedding = false; // get only sentence embedding
|
bool embedding = false; // get only sentence embedding
|
||||||
|
|||||||
@ -29,7 +29,9 @@
|
|||||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||||
|
|
||||||
// vision-specific
|
// vision-specific
|
||||||
|
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||||
|
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
||||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||||
@ -47,6 +49,7 @@
|
|||||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||||
|
|
||||||
// audio-specific
|
// audio-specific
|
||||||
|
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||||
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
||||||
|
|
||||||
@ -117,6 +120,14 @@
|
|||||||
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
|
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
|
||||||
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
|
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
|
||||||
|
|
||||||
|
// cogvlm
|
||||||
|
#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
|
||||||
|
#define TN_MM_H_TO_4H "mm.up.%s"
|
||||||
|
#define TN_MM_GATE "mm.gate.%s"
|
||||||
|
#define TN_MM_4H_TO_H "mm.down.%s"
|
||||||
|
#define TN_TOK_BOI "v.boi"
|
||||||
|
#define TN_TOK_EOI "v.eoi"
|
||||||
|
|
||||||
// align x to upper multiple of n
|
// align x to upper multiple of n
|
||||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||||
|
|
||||||
@ -141,6 +152,9 @@ enum projector_type {
|
|||||||
PROJECTOR_TYPE_VOXTRAL,
|
PROJECTOR_TYPE_VOXTRAL,
|
||||||
PROJECTOR_TYPE_LFM2,
|
PROJECTOR_TYPE_LFM2,
|
||||||
PROJECTOR_TYPE_KIMIVL,
|
PROJECTOR_TYPE_KIMIVL,
|
||||||
|
PROJECTOR_TYPE_LIGHTONOCR,
|
||||||
|
PROJECTOR_TYPE_COGVLM,
|
||||||
|
PROJECTOR_TYPE_JANUS_PRO,
|
||||||
PROJECTOR_TYPE_UNKNOWN,
|
PROJECTOR_TYPE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -164,6 +178,9 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||||
|
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||||
|
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||||
|
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||||
};
|
};
|
||||||
|
|
||||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
@ -22,9 +23,18 @@ enum clip_modality {
|
|||||||
CLIP_MODALITY_AUDIO,
|
CLIP_MODALITY_AUDIO,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum clip_flash_attn_type {
|
||||||
|
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
|
||||||
|
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
|
||||||
|
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
|
||||||
|
};
|
||||||
|
|
||||||
struct clip_context_params {
|
struct clip_context_params {
|
||||||
bool use_gpu;
|
bool use_gpu;
|
||||||
enum ggml_log_level verbosity;
|
enum ggml_log_level verbosity;
|
||||||
|
enum clip_flash_attn_type flash_attn_type;
|
||||||
|
int image_min_tokens;
|
||||||
|
int image_max_tokens;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_init_result {
|
struct clip_init_result {
|
||||||
|
|||||||
@ -182,10 +182,13 @@ struct mtmd_cli_context {
|
|||||||
void init_vision_context(common_params & params) {
|
void init_vision_context(common_params & params) {
|
||||||
const char * clip_path = params.mmproj.path.c_str();
|
const char * clip_path = params.mmproj.path.c_str();
|
||||||
mtmd_context_params mparams = mtmd_context_params_default();
|
mtmd_context_params mparams = mtmd_context_params_default();
|
||||||
mparams.use_gpu = params.mmproj_use_gpu;
|
mparams.use_gpu = params.mmproj_use_gpu;
|
||||||
mparams.print_timings = true;
|
mparams.print_timings = true;
|
||||||
mparams.n_threads = params.n_threads;
|
mparams.n_threads = params.n_threads;
|
||||||
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
|
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
|
||||||
|
mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||||
|
mparams.image_min_tokens = params.image_min_tokens;
|
||||||
|
mparams.image_max_tokens = params.image_max_tokens;
|
||||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||||
if (!ctx_vision.get()) {
|
if (!ctx_vision.get()) {
|
||||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||||
|
|||||||
@ -182,7 +182,7 @@ int32_t mtmd_helper_decode_image_chunk(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const llama_model * model = llama_get_model(lctx);
|
const llama_model * model = llama_get_model(lctx);
|
||||||
int n_mmproj_embd = llama_model_n_embd(model);
|
int n_mmproj_embd = llama_model_n_embd_inp(model);
|
||||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||||
|
|
||||||
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
||||||
|
|||||||
@ -10,7 +10,6 @@
|
|||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <limits>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// represents raw image data, layout is RGBRGBRGB...
|
// represents raw image data, layout is RGBRGBRGB...
|
||||||
@ -76,21 +75,34 @@ enum mtmd_slice_tmpl {
|
|||||||
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
||||||
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
||||||
MTMD_SLICE_TMPL_LLAMA4,
|
MTMD_SLICE_TMPL_LLAMA4,
|
||||||
// TODO @ngxson : add support for idefics (SmolVLM)
|
MTMD_SLICE_TMPL_IDEFICS3,
|
||||||
};
|
};
|
||||||
|
|
||||||
const char * mtmd_default_marker() {
|
const char * mtmd_default_marker() {
|
||||||
return "<__media__>";
|
return "<__media__>";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
|
||||||
|
switch (flash_attn_type) {
|
||||||
|
case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||||
|
case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
|
||||||
|
case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
|
||||||
|
}
|
||||||
|
return CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||||
|
}
|
||||||
|
|
||||||
mtmd_context_params mtmd_context_params_default() {
|
mtmd_context_params mtmd_context_params_default() {
|
||||||
mtmd_context_params params;
|
mtmd_context_params params {
|
||||||
params.use_gpu = true;
|
/* use_gpu */ true,
|
||||||
params.print_timings = true;
|
/* print_timings */ true,
|
||||||
params.n_threads = 4;
|
/* n_threads */ 4,
|
||||||
params.verbosity = GGML_LOG_LEVEL_INFO;
|
/* verbosity */ GGML_LOG_LEVEL_INFO,
|
||||||
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
|
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
||||||
params.media_marker = mtmd_default_marker();
|
/* media_marker */ mtmd_default_marker(),
|
||||||
|
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
||||||
|
/* image_min_tokens */ -1,
|
||||||
|
/* image_max_tokens */ -1,
|
||||||
|
};
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,19 +126,22 @@ struct mtmd_context {
|
|||||||
// for llava-uhd style models, we need special tokens in-between slices
|
// for llava-uhd style models, we need special tokens in-between slices
|
||||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||||
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
std::vector<llama_token> tok_ov_img_start; // overview image
|
||||||
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
std::vector<llama_token> tok_ov_img_end; // overview image
|
||||||
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
std::vector<llama_token> tok_slices_start; // start of all slices
|
||||||
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
std::vector<llama_token> tok_slices_end; // end of all slices
|
||||||
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
|
std::vector<llama_token> tok_sli_img_start; // single slice start
|
||||||
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
|
std::vector<llama_token> tok_sli_img_end; // single slice end
|
||||||
llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
|
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
|
||||||
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
std::vector<llama_token> tok_row_end; // end of row
|
||||||
bool tok_row_end_trail = false;
|
bool tok_row_end_trail = false;
|
||||||
bool ov_img_first = false;
|
bool ov_img_first = false;
|
||||||
|
|
||||||
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
||||||
|
|
||||||
|
// string template for slice image delimiters with row/col (idefics3)
|
||||||
|
std::string sli_img_start_tmpl;
|
||||||
|
|
||||||
// for whisper, we pre-calculate the mel filter bank
|
// for whisper, we pre-calculate the mel filter bank
|
||||||
whisper_preprocessor::whisper_filters w_filters;
|
whisper_preprocessor::whisper_filters w_filters;
|
||||||
|
|
||||||
@ -149,9 +164,14 @@ struct mtmd_context {
|
|||||||
throw std::runtime_error("media_marker must not be empty");
|
throw std::runtime_error("media_marker must not be empty");
|
||||||
}
|
}
|
||||||
|
|
||||||
clip_context_params ctx_clip_params;
|
clip_context_params ctx_clip_params {
|
||||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
/* use_gpu */ ctx_params.use_gpu,
|
||||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
/* verbosity */ ctx_params.verbosity,
|
||||||
|
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_DISABLED,
|
||||||
|
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||||
|
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||||
|
};
|
||||||
|
|
||||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||||
ctx_v = res.ctx_v;
|
ctx_v = res.ctx_v;
|
||||||
ctx_a = res.ctx_a;
|
ctx_a = res.ctx_a;
|
||||||
@ -197,13 +217,13 @@ struct mtmd_context {
|
|||||||
// minicpmv 2.5 format:
|
// minicpmv 2.5 format:
|
||||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||||
tok_ov_img_start = lookup_token("<image>");
|
tok_ov_img_start = {lookup_token("<image>")};
|
||||||
tok_ov_img_end = lookup_token("</image>");
|
tok_ov_img_end = {lookup_token("</image>")};
|
||||||
tok_slices_start = lookup_token("<slice>");
|
tok_slices_start = {lookup_token("<slice>")};
|
||||||
tok_slices_end = lookup_token("</slice>");
|
tok_slices_end = {lookup_token("</slice>")};
|
||||||
tok_sli_img_start = tok_ov_img_start;
|
tok_sli_img_start = tok_ov_img_start;
|
||||||
tok_sli_img_end = tok_ov_img_end;
|
tok_sli_img_end = tok_ov_img_end;
|
||||||
tok_row_end = lookup_token("\n");
|
tok_row_end = {lookup_token("\n")};
|
||||||
tok_row_end_trail = false; // no trailing end-of-row token
|
tok_row_end_trail = false; // no trailing end-of-row token
|
||||||
ov_img_first = true;
|
ov_img_first = true;
|
||||||
|
|
||||||
@ -211,11 +231,11 @@ struct mtmd_context {
|
|||||||
// minicpmv 2.6 format:
|
// minicpmv 2.6 format:
|
||||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||||
tok_ov_img_start = lookup_token("<image>");
|
tok_ov_img_start = {lookup_token("<image>")};
|
||||||
tok_ov_img_end = lookup_token("</image>");
|
tok_ov_img_end = {lookup_token("</image>")};
|
||||||
tok_sli_img_start = lookup_token("<slice>");
|
tok_sli_img_start = {lookup_token("<slice>")};
|
||||||
tok_sli_img_end = lookup_token("</slice>");
|
tok_sli_img_end = {lookup_token("</slice>")};
|
||||||
tok_row_end = lookup_token("\n");
|
tok_row_end = {lookup_token("\n")};
|
||||||
tok_row_end_trail = false; // no trailing end-of-row token
|
tok_row_end_trail = false; // no trailing end-of-row token
|
||||||
ov_img_first = true;
|
ov_img_first = true;
|
||||||
|
|
||||||
@ -230,9 +250,9 @@ struct mtmd_context {
|
|||||||
// <|image|> (overview) <-- overview image is last
|
// <|image|> (overview) <-- overview image is last
|
||||||
// <|image_end|>
|
// <|image_end|>
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
||||||
tok_ov_img_start = lookup_token("<|image|>");
|
tok_ov_img_start = {lookup_token("<|image|>")};
|
||||||
tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
|
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
|
||||||
tok_row_end = lookup_token("<|tile_y_separator|>");
|
tok_row_end = {lookup_token("<|tile_y_separator|>")};
|
||||||
tok_row_end_trail = true; // add trailing end-of-row token
|
tok_row_end_trail = true; // add trailing end-of-row token
|
||||||
ov_img_first = false; // overview image is last
|
ov_img_first = false; // overview image is last
|
||||||
}
|
}
|
||||||
@ -245,8 +265,12 @@ struct mtmd_context {
|
|||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
||||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||||
img_beg = "<fake_token_around_image><global-img>";
|
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||||
img_end = "<fake_token_around_image>";
|
tok_ov_img_start = {lookup_token("\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||||
|
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||||
|
tok_row_end = {lookup_token("\n")};
|
||||||
|
img_beg = "<fake_token_around_image>";
|
||||||
|
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||||
|
|
||||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
||||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||||
@ -269,6 +293,11 @@ struct mtmd_context {
|
|||||||
img_beg = "<img>";
|
img_beg = "<img>";
|
||||||
img_end = "</img>";
|
img_end = "</img>";
|
||||||
|
|
||||||
|
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
|
||||||
|
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||||
|
img_beg = "<|im_start|>";
|
||||||
|
img_end = "<|im_end|>";
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -358,9 +387,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void mtmd_free(mtmd_context * ctx) {
|
void mtmd_free(mtmd_context * ctx) {
|
||||||
if (ctx) {
|
delete ctx;
|
||||||
delete ctx;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct mtmd_tokenizer {
|
struct mtmd_tokenizer {
|
||||||
@ -504,6 +531,7 @@ struct mtmd_tokenizer {
|
|||||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||||
|
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|
||||||
) {
|
) {
|
||||||
const int n_col = batch_f32.grid_x;
|
const int n_col = batch_f32.grid_x;
|
||||||
const int n_row = batch_f32.grid_y;
|
const int n_row = batch_f32.grid_y;
|
||||||
@ -517,53 +545,45 @@ struct mtmd_tokenizer {
|
|||||||
|
|
||||||
// add overview image (first)
|
// add overview image (first)
|
||||||
if (ctx->ov_img_first) {
|
if (ctx->ov_img_first) {
|
||||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_ov_img_start);
|
||||||
add_text({ctx->tok_ov_img_start});
|
|
||||||
}
|
|
||||||
cur.entries.emplace_back(std::move(ov_chunk));
|
cur.entries.emplace_back(std::move(ov_chunk));
|
||||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_ov_img_end);
|
||||||
add_text({ctx->tok_ov_img_end});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add slices (or tiles)
|
// add slices (or tiles)
|
||||||
if (!chunks.empty()) {
|
if (!chunks.empty()) {
|
||||||
GGML_ASSERT((int)chunks.size() == n_row * n_col);
|
GGML_ASSERT((int)chunks.size() == n_row * n_col);
|
||||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_slices_start);
|
||||||
add_text({ctx->tok_slices_start});
|
|
||||||
}
|
|
||||||
for (int y = 0; y < n_row; y++) {
|
for (int y = 0; y < n_row; y++) {
|
||||||
for (int x = 0; x < n_col; x++) {
|
for (int x = 0; x < n_col; x++) {
|
||||||
const bool is_last_in_row = (x == n_col - 1);
|
const bool is_last_in_row = (x == n_col - 1);
|
||||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
if (!ctx->tok_sli_img_start.empty()) {
|
||||||
add_text({ctx->tok_sli_img_start});
|
add_text(ctx->tok_sli_img_start);
|
||||||
|
} else if (!ctx->sli_img_start_tmpl.empty()) {
|
||||||
|
// If using a template to precede a slice image
|
||||||
|
const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
|
||||||
|
std::unique_ptr<char[]> buf(new char[sz]);
|
||||||
|
std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
|
||||||
|
add_text(std::string(buf.get(), buf.get() + sz - 1), true);
|
||||||
}
|
}
|
||||||
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_sli_img_end);
|
||||||
add_text({ctx->tok_sli_img_end});
|
if (!is_last_in_row) {
|
||||||
}
|
add_text(ctx->tok_sli_img_mid);
|
||||||
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
|
||||||
add_text({ctx->tok_sli_img_mid});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
|
||||||
add_text({ctx->tok_row_end});
|
add_text(ctx->tok_row_end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_slices_end);
|
||||||
add_text({ctx->tok_slices_end});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add overview image (last)
|
// add overview image (last)
|
||||||
if (!ctx->ov_img_first) {
|
if (!ctx->ov_img_first) {
|
||||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_ov_img_start);
|
||||||
add_text({ctx->tok_ov_img_start});
|
|
||||||
}
|
|
||||||
cur.entries.emplace_back(std::move(ov_chunk));
|
cur.entries.emplace_back(std::move(ov_chunk));
|
||||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
add_text(ctx->tok_ov_img_end);
|
||||||
add_text({ctx->tok_ov_img_end});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
@ -780,7 +800,9 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
|||||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||||
bool ok = false;
|
bool ok = false;
|
||||||
|
|
||||||
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
|
if (clip_is_llava(ctx_clip)
|
||||||
|
|| clip_is_minicpmv(ctx_clip)
|
||||||
|
|| clip_is_glm(ctx_clip)) {
|
||||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||||
const auto & entries = image_tokens->batch_f32.entries;
|
const auto & entries = image_tokens->batch_f32.entries;
|
||||||
for (size_t i = 0; i < entries.size(); i++) {
|
for (size_t i = 0; i < entries.size(); i++) {
|
||||||
|
|||||||
@ -82,6 +82,11 @@ struct mtmd_context_params {
|
|||||||
enum ggml_log_level verbosity;
|
enum ggml_log_level verbosity;
|
||||||
const char * image_marker; // deprecated, use media_marker instead
|
const char * image_marker; // deprecated, use media_marker instead
|
||||||
const char * media_marker;
|
const char * media_marker;
|
||||||
|
enum llama_flash_attn_type flash_attn_type;
|
||||||
|
|
||||||
|
// limit number of image tokens, only for vision models with dynamic resolution
|
||||||
|
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||||
|
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
|
||||||
};
|
};
|
||||||
|
|
||||||
MTMD_API const char * mtmd_default_marker(void);
|
MTMD_API const char * mtmd_default_marker(void);
|
||||||
|
|||||||
@ -69,6 +69,8 @@ add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
|||||||
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||||
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||||
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
|
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
|
||||||
|
add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
|
||||||
|
add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
|
||||||
|
|
||||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||||
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||||
@ -137,7 +139,10 @@ for i in "${!arr_hf[@]}"; do
|
|||||||
|
|
||||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||||
|
|
||||||
if echo "$output" | grep -iq "new york"; then
|
# either contains "new york" or both "men" and "walk"
|
||||||
|
if echo "$output" | grep -iq "new york" \
|
||||||
|
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
|
||||||
|
then
|
||||||
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
||||||
else
|
else
|
||||||
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
||||||
|
|||||||
@ -1839,7 +1839,10 @@ struct server_context {
|
|||||||
mparams.use_gpu = params.mmproj_use_gpu;
|
mparams.use_gpu = params.mmproj_use_gpu;
|
||||||
mparams.print_timings = false;
|
mparams.print_timings = false;
|
||||||
mparams.n_threads = params.n_threads;
|
mparams.n_threads = params.n_threads;
|
||||||
|
mparams.flash_attn_type = params.flash_attn? LLAMA_FLASH_ATTN_TYPE_ENABLED: LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||||
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
|
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
|
||||||
|
mparams.image_min_tokens = params.image_min_tokens;
|
||||||
|
mparams.image_max_tokens = params.image_max_tokens;
|
||||||
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
||||||
if (mctx == nullptr) {
|
if (mctx == nullptr) {
|
||||||
LOG_ERROR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
|
LOG_ERROR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
|
||||||
|
|||||||
@ -2574,6 +2574,12 @@ extern "C" {
|
|||||||
GGML_API size_t ggml_graph_overhead(void);
|
GGML_API size_t ggml_graph_overhead(void);
|
||||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||||
|
|
||||||
|
GGML_API int ggml_graph_size(struct ggml_cgraph* cgraph);
|
||||||
|
GGML_API struct ggml_tensor* ggml_graph_node(struct ggml_cgraph* cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||||
|
GGML_API struct ggml_tensor** ggml_graph_nodes(struct ggml_cgraph* cgraph);
|
||||||
|
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph* cgraph);
|
||||||
|
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||||
|
|||||||
@ -24439,6 +24439,28 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
|||||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_graph_size(struct ggml_cgraph* cgraph) {
|
||||||
|
return cgraph->size;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* ggml_graph_node(struct ggml_cgraph* cgraph, int i) {
|
||||||
|
if (i < 0) {
|
||||||
|
GGML_ASSERT(cgraph->n_nodes + i >= 0);
|
||||||
|
return cgraph->nodes[cgraph->n_nodes + i];
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(i < cgraph->n_nodes);
|
||||||
|
return cgraph->nodes[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor** ggml_graph_nodes(struct ggml_cgraph* cgraph) {
|
||||||
|
return cgraph->nodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_graph_n_nodes(struct ggml_cgraph* cgraph) {
|
||||||
|
return cgraph->n_nodes;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// thread data
|
// thread data
|
||||||
//
|
//
|
||||||
|
|||||||
@ -266,6 +266,12 @@ extern "C" {
|
|||||||
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
|
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_flash_attn_type {
|
||||||
|
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
|
||||||
|
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
|
||||||
|
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
|
||||||
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||||
@ -589,6 +595,8 @@ extern "C" {
|
|||||||
LLAMA_API const struct llama_vocab* llama_get_model_vocab(const struct llama_model* model);
|
LLAMA_API const struct llama_vocab* llama_get_model_vocab(const struct llama_model* model);
|
||||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||||
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
||||||
|
LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model* model);
|
||||||
|
|
||||||
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||||
|
|
||||||
// Compat
|
// Compat
|
||||||
|
|||||||
@ -27,6 +27,8 @@ const char * llama_hparams::rope_scaling_type_name(llama_rope_scaling_type type)
|
|||||||
return LLAMA_ROPE_SCALING_TYPES.at(type);
|
return LLAMA_ROPE_SCALING_TYPES.at(type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void llm_load_hparams(
|
void llm_load_hparams(
|
||||||
llama_model_loader & ml,
|
llama_model_loader & ml,
|
||||||
llama_model & model) {
|
llama_model & model) {
|
||||||
|
|||||||
@ -190,6 +190,16 @@ struct llama_hparams {
|
|||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t n_embd_inp() const {
|
||||||
|
uint32_t n_embd_inp = n_embd;
|
||||||
|
|
||||||
|
if (n_deepstack_layers > 0) {
|
||||||
|
n_embd_inp += n_embd * n_deepstack_layers;
|
||||||
|
}
|
||||||
|
|
||||||
|
return n_embd_inp;
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t n_ff(uint32_t il = 0) const {
|
uint32_t n_ff(uint32_t il = 0) const {
|
||||||
if (il < n_layer) {
|
if (il < n_layer) {
|
||||||
return n_ff_arr[il];
|
return n_ff_arr[il];
|
||||||
|
|||||||
@ -346,6 +346,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||||
case LLAMA_VOCAB_PRE_TYPE_JAIS:
|
case LLAMA_VOCAB_PRE_TYPE_JAIS:
|
||||||
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
|
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
};
|
};
|
||||||
@ -1962,8 +1963,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "bailingmoe" ||
|
tokenizer_pre == "granite-docling") {
|
||||||
tokenizer_pre == "bailingmoe2" ||
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
|
||||||
|
clean_spaces = false;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "bailingmoe" ||
|
||||||
|
tokenizer_pre == "bailingmoe2"||
|
||||||
tokenizer_pre == "llada-moe") {
|
tokenizer_pre == "llada-moe") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
|
|||||||
@ -8,47 +8,48 @@
|
|||||||
|
|
||||||
// pre-tokenization types
|
// pre-tokenization types
|
||||||
enum llama_vocab_pre_type {
|
enum llama_vocab_pre_type {
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||||
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 40,
|
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 40,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 41,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV;
|
struct LLM_KV;
|
||||||
|
|||||||
@ -4773,6 +4773,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
|
|||||||
return model->hparams.n_embd;
|
return model->hparams.n_embd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int32_t llama_model_n_embd_inp(const llama_model* model) {
|
||||||
|
return model->hparams.n_embd_inp();
|
||||||
|
}
|
||||||
|
|
||||||
int32_t llama_n_layer(const struct llama_model * model) {
|
int32_t llama_n_layer(const struct llama_model * model) {
|
||||||
return model->hparams.n_layer;
|
return model->hparams.n_layer;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2431,17 +2431,47 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||||||
test_cases.emplace_back(new test_timestep_embedding());
|
test_cases.emplace_back(new test_timestep_embedding());
|
||||||
test_cases.emplace_back(new test_leaky_relu());
|
test_cases.emplace_back(new test_leaky_relu());
|
||||||
|
|
||||||
for (int hs : { 64, 80, 128, 256, }) {
|
for (bool v : {false, true}) {
|
||||||
for (bool mask : { true, false } ) {
|
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
|
||||||
for (float max_bias : { 0.0f, 8.0f }) {
|
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v));
|
||||||
if (!mask && max_bias > 0.0f) continue;
|
}
|
||||||
for (float softcap : {0.0f, 10.0f}) {
|
|
||||||
if (hs != 128 && softcap != 0.0f) continue;
|
for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) {
|
||||||
for (int nh : { 32, }) {
|
for (int hsv : { 40, 64, 72, 80, 96, 128, 192, 256, 512 }) {
|
||||||
for (int kv : { 512, 1024, }) {
|
if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
|
||||||
for (int nb : { 1, 2, 4, 8, }) {
|
if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
|
||||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
|
||||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, softcap, type_KV));
|
|
||||||
|
for (bool mask : { true, false } ) {
|
||||||
|
for (bool sinks : { true, false } ) {
|
||||||
|
for (float max_bias : { 0.0f, 8.0f }) {
|
||||||
|
if (!mask && max_bias > 0.0f) continue;
|
||||||
|
for (float logit_softcap : {0.0f, 10.0f}) {
|
||||||
|
if (hsk != 128 && logit_softcap != 0.0f) continue;
|
||||||
|
for (int nh : { 4, }) {
|
||||||
|
for (int nr3 : { 1, 3, }) {
|
||||||
|
if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
|
||||||
|
for (int nr2 : { 1, 4, 16 }) {
|
||||||
|
if (nr2 == 16 && hsk != 128) continue;
|
||||||
|
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
|
||||||
|
for (int kv : { 113, 512, 1024, }) {
|
||||||
|
if (nr2 != 1 && kv != 512) continue;
|
||||||
|
for (int nb : { 1, 3, 32, 35, }) {
|
||||||
|
for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) {
|
||||||
|
if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue;
|
||||||
|
for (ggml_type type_KV : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||||
|
test_cases.emplace_back(new test_flash_attn_ext(
|
||||||
|
hsk, hsv, nh, {nr2, nr3}, kv, nb, mask, sinks, max_bias, logit_softcap, prec, type_KV));
|
||||||
|
// run fewer test cases permuted
|
||||||
|
if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
|
||||||
|
test_cases.emplace_back(new test_flash_attn_ext(
|
||||||
|
hsk, hsv, nh, {nr2, nr3}, kv, nb, mask, sinks, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user