From af62a37acd9ec6ea17dbb9892e9188d749d75e22 Mon Sep 17 00:00:00 2001 From: Farmadupe Date: Fri, 26 Jun 2026 07:48:48 +0100 Subject: [PATCH] Prune examples/llava. Dead code. (#2025) examples/llava has been replaced by mtmd since late 2025, and has been out-of-build in ik_llama.cpp since examples/CMakeLists.txt removed it in #798. Repointed descriptions from llava to mtmd where they remained. --- common/common.cpp | 2 +- examples/llava/CMakeLists.txt | 45 - examples/llava/MobileVLM-README.md | 377 --- examples/llava/README-minicpmv2.5.md | 99 - examples/llava/README.md | 139 - examples/llava/android/adb_run.sh | 53 - examples/llava/android/build_64.sh | 8 - examples/llava/clip.cpp | 2560 ----------------- examples/llava/clip.h | 94 - .../llava/convert_image_encoder_to_gguf.py | 333 --- examples/llava/llava-cli.cpp | 340 --- examples/llava/llava.cpp | 480 ---- examples/llava/llava.h | 49 - examples/llava/llava_surgery.py | 38 - examples/llava/llava_surgery_v2.py | 159 - examples/llava/minicpmv-cli.cpp | 309 -- .../minicpmv-convert-image-encoder-to-gguf.py | 382 --- examples/llava/minicpmv-surgery.py | 47 - examples/llava/requirements.txt | 5 - examples/server/README.md | 2 +- requirements/requirements-all.txt | 1 - 21 files changed, 2 insertions(+), 5520 deletions(-) delete mode 100644 examples/llava/CMakeLists.txt delete mode 100644 examples/llava/MobileVLM-README.md delete mode 100644 examples/llava/README-minicpmv2.5.md delete mode 100644 examples/llava/README.md delete mode 100755 examples/llava/android/adb_run.sh delete mode 100755 examples/llava/android/build_64.sh delete mode 100644 examples/llava/clip.cpp delete mode 100644 examples/llava/clip.h delete mode 100644 examples/llava/convert_image_encoder_to_gguf.py delete mode 100644 examples/llava/llava-cli.cpp delete mode 100644 examples/llava/llava.cpp delete mode 100644 examples/llava/llava.h delete mode 100644 examples/llava/llava_surgery.py delete mode 100644 examples/llava/llava_surgery_v2.py delete mode 100644 examples/llava/minicpmv-cli.cpp delete mode 100644 examples/llava/minicpmv-convert-image-encoder-to-gguf.py delete mode 100644 examples/llava/minicpmv-surgery.py delete mode 100644 examples/llava/requirements.txt diff --git a/common/common.cpp b/common/common.cpp index e189c79f..5dc820ca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3205,7 +3205,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); options.push_back({ "multi-modality" }); - options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); + options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file. see examples/mtmd/README.md" }); options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" }); options.push_back({ "*", " --image-min-tokens N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)"}); options.push_back({ "*", " --image-max-tokens N", "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)" }); diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt deleted file mode 100644 index bbf5fec5..00000000 --- a/examples/llava/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -add_library(llava OBJECT - llava.cpp - llava.h - clip.cpp - clip.h - ) - -target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - -target_include_directories(llava PUBLIC .) -target_include_directories(llava PUBLIC ../..) -target_include_directories(llava PUBLIC ../../common) - -target_compile_features(llava PRIVATE cxx_std_11) - -add_library(llava_static STATIC $) -if (BUILD_SHARED_LIBS) - set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD) - add_library(llava_shared SHARED $) - target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - install(TARGETS llava_shared LIBRARY) -endif() - -if (NOT MSVC) - target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h -endif() - -if(TARGET BUILD_INFO) - add_dependencies(llava BUILD_INFO) -endif() - -set(TARGET llama-llava-cli) -add_executable(${TARGET} llava-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TARGET llama-minicpmv-cli) -add_executable(${TARGET} minicpmv-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md deleted file mode 100644 index 06a65fba..00000000 --- a/examples/llava/MobileVLM-README.md +++ /dev/null @@ -1,377 +0,0 @@ -# MobileVLM - -Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants. - -for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM) - -The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava. - -Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. - -## Usage -Build with cmake or run `make llama-llava-cli` to build it. - -After building, run: `./llama-llava-cli` to see the usage. For example: - -```sh -./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ - --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ - --image path/to/an/image.jpg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" -``` - -## Model conversion - -1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally: - -```sh -git clone https://huggingface.co/mtgv/MobileVLM-1.7B - -git clone https://huggingface.co/openai/clip-vit-large-patch14-336 -``` - -2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: - -```sh -python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B -``` - -3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: - -```sh -python ./examples/llava/convert_image_encoder_to_gguf \ - -m path/to/clip-vit-large-patch14-336 \ - --llava-projector path/to/MobileVLM-1.7B/llava.projector \ - --output-dir path/to/MobileVLM-1.7B \ - --projector-type ldp -``` - -```sh -python ./examples/llava/convert_image_encoder_to_gguf \ - -m path/to/clip-vit-large-patch14-336 \ - --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ - --output-dir path/to/MobileVLM-1.7B_V2 \ - --projector-type ldpv2 -``` - -4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: - -```sh -python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B -``` - -5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` -```sh -./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s -``` - -Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. - -## Android compile and run -### compile -refer to `examples/llava/android/build_64.sh` -```sh -mkdir examples/llava/android/build_64 -cd examples/llava/android/build_64 -../build_64.sh -``` -### run on Android -refer to `android/adb_run.sh`, modify resources' `name` and `path` - -## Some result on Android with `Snapdragon 888` chip -### case 1 -**input** -```sh -/data/local/tmp/llama-llava-cli \ - -m /data/local/tmp/ggml-model-q4_k.gguf \ - --mmproj /data/local/tmp/mmproj-model-f16.gguf \ - -t 4 \ - --image /data/local/tmp/demo.jpg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" -``` -**output** -```sh -encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch) - Susan Wise Bauer -llama_print_timings: load time = 23574.72 ms -llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second) -llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second) -llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second) -llama_print_timings: total time = 34731.93 ms -``` -### case 2 -**input** -```sh -/data/local/tmp/llama-llava-cli \ - -m /data/local/tmp/ggml-model-q4_k.gguf \ - --mmproj /data/local/tmp/mmproj-model-f16.gguf \ - -t 4 \ - --image /data/local/tmp/cat.jpeg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" -``` -**output** -```sh -encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch) - The image depicts a cat sitting in the grass near some tall green plants. -llama_print_timings: load time = 23257.32 ms -llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second) -llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second) -llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second) -llama_print_timings: total time = 34570.79 ms -``` - - -## Some result on Android with `Snapdragon 778G` chip -### MobileVLM-1.7B case -#### llava-cli release-b2005 -**input** -```sh -/data/local/tmp/llama-llava-cli \ - -m /data/local/tmp/ggml-model-q4_k.gguf \ - --mmproj /data/local/tmp/mmproj-model-f16.gguf \ - -t 4 \ - --image /data/local/tmp/many_llamas.jpeg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" -``` -**output** -```sh -encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that? ASSISTANT: - - A group of llamas are standing in a green pasture. - -llama_print_timings: load time = 20357.33 ms -llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second) -llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second) -llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second) -llama_print_timings: total time = 28038.34 ms / 205 tokens -``` -#### llava-cli latest-version -**input** - -Just the same as above. - -**output**(seems to be much slower) -```sh -encode_image_with_clip: image embedding created: 144 tokens - -encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that? ASSISTANT: - - It is a group of sheep standing together in a grass field. - -llama_print_timings: load time = 818120.91 ms -llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second) -llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second) -llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second) -llama_print_timings: total time = 865441.76 ms / 204 tokens -``` -### MobileVLM_V2-1.7B case -#### llava-cli release-2005b -**input** - -Just the same as above. - -**output** -```sh -encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that? ASSISTANT: - - This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting. - -The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama - -llama_print_timings: load time = 22406.77 ms -llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second) -llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second) -llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second) -llama_print_timings: total time = 44411.01 ms / 377 tokens -``` - -## Orin compile and run -### compile -```sh -make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 -``` -### run on Orin -### case 1 -**input** -```sh -./llama-llava-cli \ - -m /data/local/tmp/ggml-model-q4_k.gguf \ - --mmproj /data/local/tmp/mmproj-model-f16.gguf \ - --image /data/local/tmp/demo.jpeg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \ - --n-gpu-layers 999 -``` -**output** -```sh - -encode_image_with_clip: image encoded in 296.62 ms by CLIP ( 2.06 ms per image patch) - - Susan Wise Bauer - -llama_print_timings: load time = 1067.64 ms -llama_print_timings: sample time = 1.53 ms / 6 runs ( 0.25 ms per token, 3934.43 tokens per second) -llama_print_timings: prompt eval time = 306.84 ms / 246 tokens ( 1.25 ms per token, 801.72 tokens per second) -llama_print_timings: eval time = 91.50 ms / 6 runs ( 15.25 ms per token, 65.58 tokens per second) -llama_print_timings: total time = 1352.63 ms / 252 tokens -``` - -### case 2 -**input** -```sh -./llama-llava-cli \ - -m /data/local/tmp/ggml-model-q4_k.gguf \ - --mmproj /data/local/tmp/mmproj-model-f16.gguf \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \ - --n-gpu-layers 999 - -``` -**output** -```sh -encode_image_with_clip: image encoded in 302.15 ms by CLIP ( 2.10 ms per image patch) - - The image features a cat lying in the grass. - -llama_print_timings: load time = 1057.07 ms -llama_print_timings: sample time = 3.27 ms / 11 runs ( 0.30 ms per token, 3360.83 tokens per second) -llama_print_timings: prompt eval time = 213.60 ms / 232 tokens ( 0.92 ms per token, 1086.14 tokens per second) -llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 ms per token, 66.01 tokens per second) -llama_print_timings: total time = 1365.47 ms / 243 tokens -``` - -## Running on Intel(R) Core(TM) i7-10750H -### Operating system -Ubuntu22.04 -### compile -```sh -make -j32 -``` -### MobileVLM-1.7B case -**input** -```sh --m /path/to/ggml-model-q4_k.gguf \ - --mmproj /path/to/mmproj-model-f16.gguf \ - --image /path/to/many_llamas.jpeg - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" \ -``` -**output** -```sh -encode_image_with_clip: image embedding created: 144 tokens - -encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that?ASSISTANT: - - A group of llamas are walking together in a field. - -llama_print_timings: load time = 5506.60 ms -llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second) -llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second) -llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second) -llama_print_timings: total time = 5990.25 ms / 202 tokens -``` - -### MobileVLM_V2-1.7B case -**input** - -Just the same as above. - -**ouput** -```sh -encode_image_with_clip: image embedding created: 144 tokens - -encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that?ASSISTANT: - - The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order. - -The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment. - -The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park. - -llama_print_timings: load time = 6642.61 ms -llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second) -llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second) -llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second) -llama_print_timings: total time = 15513.95 ms / 412 tokens -``` - -## Run on Intel(R) Core(TM) Ultra7 115H -### operation system -Windows11 -### comiple -```sh -make -j32 -``` -### MobileVLM-1.7B case -**input** -```sh --m /path/to/ggml-model-q4_k.gguf \ - --mmproj /path/to/tmp/mmproj-model-f16.gguf \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" \ -``` -**output** -```sh -encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that? ASSISTANT: - - The image features a group of brown and white llamas standing in a grassy field. - -llama_print_timings: load time = 7441.06 ms -llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second) -llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second) -llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second) -llama_print_timings: total time = 7987.23 ms / 209 tokens -``` - -### MobileVLM_V2-1.7B case -**input** - -Just the same as above. - -**output** -```sh -encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch) -system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: -user_prompt: \nWhat's that? ASSISTANT: - - This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One - of them, possibly the first in the line, has its back turned, perhaps observing something in the distance. - -The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer. - -The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in - front is not visible, indicating that it might not be the main focus of the photo. - -The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves. - - -llama_print_timings: load time = 7015.35 ms -llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second) -llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second) -llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second) -llama_print_timings: total time = 14371.19 ms / 446 tokens -``` - -## TODO - -- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid` -- [ ] Optimize LDP projector performance - - - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`; - - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc. -- [x] run MobileVLM on `Jetson Orin` -- [ ] Support more model variants, such as `MobileVLM-3B`. - - -## contributor -```sh -zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77 -``` diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md deleted file mode 100644 index 4affc1d0..00000000 --- a/examples/llava/README-minicpmv2.5.md +++ /dev/null @@ -1,99 +0,0 @@ -## MiniCPM-Llama3-V 2.5 - -### Prepare models and code - -Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder. - -Clone llama.cpp: -```bash -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp -``` - -### Usage - -Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) - -```bash -python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 -python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 -python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model - -# quantize int4 version -./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M -``` - -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` - -Inference on Linux or Mac -``` -# run f16 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -``` diff --git a/examples/llava/README.md b/examples/llava/README.md deleted file mode 100644 index 01245136..00000000 --- a/examples/llava/README.md +++ /dev/null @@ -1,139 +0,0 @@ -# LLaVA - -Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, -as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. - -The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) -and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) -models are available. -For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) - -After API is confirmed, more models will be supported / uploaded. - -## Usage -Build with cmake or run `make llama-llava-cli` to build it. - -After building, run: `./llama-llava-cli` to see the usage. For example: - -```sh -./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg -``` - -**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -**note**: For GPU offloading ensure to use the `-ngl` flag just like usual - -## LLaVA 1.5 - -1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: - -```sh -git clone https://huggingface.co/liuhaotian/llava-v1.5-7b - -git clone https://huggingface.co/openai/clip-vit-large-patch14-336 -``` - -2. Install the required Python packages: - -```sh -pip install -r examples/llava/requirements.txt -``` - -3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: - -```sh -python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b -``` - -4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: - -```sh -python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b -``` - -5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: - -```sh -python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown -``` - -Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. - -## LLaVA 1.6 gguf conversion -1) First clone a LLaVA 1.6 model: -```console -git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b -``` - -2) Install the required Python packages: - -```sh -pip install -r examples/llava/requirements.txt -``` - -3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: -```console -python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ -``` -- you will find a llava.projector and a llava.clip file in your model directory - -4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: -```console -mkdir vit -cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin -cp ../llava-v1.6-vicuna-7b/llava.projector vit/ -curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json -``` - -5) Create the visual gguf model: -```console -python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision -``` -- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP - -6) Then convert the model to gguf format: -```console -python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown -``` - -7) And finally we can run the llava cli using the 1.6 model version: -```console -./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 -``` - -**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) -**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) - -## llava-cli templating and llava-1.6 prompting - -llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."` -For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system: - -**For Mistral and using llava-cli binary:** -Add this: `-p "\nUSER:\nProvide a full description.\nASSISTANT:\n"` -The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role - -**For the 34B this should work:** -Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nProvide a full description.<|im_end|><|im_start|>assistant\n` - - -## How to know if you are running in llava-1.5 or llava-1.6 mode - -When running llava-cli you will see a visual information right before the prompt is being processed: - -**Llava-1.5:** -`encode_image_with_clip: image embedding created: 576 tokens` - -**Llava-1.6 (anything above 576):** -`encode_image_with_clip: image embedding created: 2880 tokens` - - -Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6 - - - - -## TODO - -- [x] Support non-CPU backend for the image encoding part. -- [ ] Support different sampling methods. -- [ ] Support more model variants. diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh deleted file mode 100755 index 45ccf8d7..00000000 --- a/examples/llava/android/adb_run.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed" -projector_name="mmproj-model-f16.gguf" -llama_name="ggml-model-q4_k.gguf" -img_dir="/Users/cxt/model/llm" -img_name="demo.jpg" -prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" -# img_name="cat.jpeg" -# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" - -program_dir="build_64/bin" -binName="llama-llava-cli" -n_threads=4 - - -deviceDir="/data/local/tmp" -saveDir="output" -if [ ! -d ${saveDir} ]; then - mkdir ${saveDir} -fi - - -function android_run() { - # # copy resource into device - # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name} - # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name} - adb push ${img_dir}/${img_name} ${deviceDir}/${img_name} - # copy program into device - adb push ${program_dir}/${binName} ${deviceDir}/${binName} - adb shell "chmod 0777 ${deviceDir}/${binName}" - - # run - adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \ - -m ${deviceDir}/${llama_name} \ - --mmproj ${deviceDir}/${projector_name} \ - -t ${n_threads} \ - --image ${deviceDir}/${img_name} \ - -p \"${prompt}\" \ - > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt" - adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \ - -m ${deviceDir}/${llama_name} \ - --mmproj ${deviceDir}/${projector_name} \ - -t ${n_threads} \ - --image ${deviceDir}/${img_name} \ - -p \"${prompt}\" \ - >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1" - adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir} -} - -android_run - -echo "android_run is Done!" diff --git a/examples/llava/android/build_64.sh b/examples/llava/android/build_64.sh deleted file mode 100755 index 71b6fd3f..00000000 --- a/examples/llava/android/build_64.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -cmake ../../../../ \ --DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ --DCMAKE_BUILD_TYPE=Release \ --DANDROID_ABI="arm64-v8a" \ --DANDROID_PLATFORM=android-23 $1 - -make -j4 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp deleted file mode 100644 index 54aa822c..00000000 --- a/examples/llava/clip.cpp +++ /dev/null @@ -1,2560 +0,0 @@ -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch -#include "clip.h" -#include "log.h" -#include "ggml.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#define STB_IMAGE_IMPLEMENTATION -#include "stb_image.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//#define CLIP_DEBUG_FUNCTIONS - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), buf.size()); -} - -// -// key constants -// - -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" - -#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" -#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" -#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" - - -// -// tensor name constants -// - -#define TN_TOKEN_EMBD "%s.token_embd.weight" -#define TN_POS_EMBD "%s.position_embd.weight" -#define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" -#define TN_PATCH_BIAS "v.patch_embd.bias" -#define TN_ATTN_K "%s.blk.%d.attn_k.%s" -#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" -#define TN_ATTN_V "%s.blk.%d.attn_v.%s" -#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" -#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" -#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" -#define TN_LN_2 "%s.blk.%d.ln2.%s" -#define TN_LN_PRE "%s.pre_ln.%s" -#define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" -#define TN_LLAVA_PROJ "mm.%d.%s" -#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" -#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" -#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" - -#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" -#define TN_MINICPMV_QUERY "resampler.query" -#define TN_MINICPMV_PROJ "resampler.proj.weight" -#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" -#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" -#define TN_MINICPMV_LN "resampler.ln_%s.%s" - - -enum projector_type { - PROJECTOR_TYPE_MLP, - PROJECTOR_TYPE_MLP_NORM, - PROJECTOR_TYPE_LDP, - PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, - PROJECTOR_TYPE_UNKNOWN, -}; - -static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, -}; - - -// -// utilities to get data from a gguf file -// - -static int get_key_idx(const gguf_context * ctx, const char * key) { - int i = gguf_find_key(ctx, key); - if (i == -1) { - LOG_TEE("key %s not found in file\n", key); - throw std::runtime_error(format("Missing required key: %s", key)); - } - - return i; -} - -static uint32_t get_u32(const gguf_context * ctx, const std::string & key) { - const int i = get_key_idx(ctx, key.c_str()); - - return gguf_get_val_u32(ctx, i); -} - -static float get_f32(const gguf_context * ctx, const std::string & key) { - const int i = get_key_idx(ctx, key.c_str()); - - return gguf_get_val_f32(ctx, i); -} - -static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if (!cur) { - throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); - } - - return cur; -} - -static std::string get_ftype(int ftype) { - return ggml_type_name(static_cast(ftype)); -} - -static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { - switch (type) { - case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); - case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); - case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); - case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); - case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); - case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); - case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); - case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); - case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); - case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); - case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; - default: return format("unknown type %d", type); - } -} - -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; // Avoid infinite loop if 'search' is an empty string - } - size_t pos = 0; - while ((pos = s.find(search, pos)) != std::string::npos) { - s.replace(pos, search.length(), replace); - pos += replace.length(); - } -} - -static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); - - switch (type) { - case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); - case GGUF_TYPE_ARRAY: - { - const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); - int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); - std::stringstream ss; - ss << "["; - for (int j = 0; j < arr_n; j++) { - if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); - // escape quotes - replace_all(val, "\\", "\\\\"); - replace_all(val, "\"", "\\\""); - ss << '"' << val << '"'; - } else if (arr_type == GGUF_TYPE_ARRAY) { - ss << "???"; - } else { - ss << gguf_data_to_str(arr_type, data, j); - } - if (j < arr_n - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - } - default: - return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); - } -} - -static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { - size_t tensor_size = ggml_nbytes(tensor); - LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", - prefix, ggml_n_dims(tensor), tensor->name, tensor_size, - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); -} - -static projector_type clip_projector_type_from_string(const std::string & name) { - for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT - if (kv.second == name) { - return kv.first; - } - } - return PROJECTOR_TYPE_UNKNOWN; -} - -#ifdef CLIP_DEBUG_FUNCTIONS -static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} - -static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), paddingAmount); - } - - file.close(); -} - -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - - -// -// clip layers -// - -struct clip_hparams { - int32_t image_size; - int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; - - float eps; - - char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) - - int32_t image_grid_pinpoints[32]; - int32_t image_crop_resolution; -}; - -struct clip_layer { - // attention - struct ggml_tensor * k_w; - struct ggml_tensor * k_b; - struct ggml_tensor * q_w; - struct ggml_tensor * q_b; - struct ggml_tensor * v_w; - struct ggml_tensor * v_b; - - struct ggml_tensor * o_w; - struct ggml_tensor * o_b; - - // layernorm 1 - struct ggml_tensor * ln_1_w; - struct ggml_tensor * ln_1_b; - - // ff - struct ggml_tensor * ff_i_w; - struct ggml_tensor * ff_i_b; - - struct ggml_tensor * ff_o_w; - struct ggml_tensor * ff_o_b; - - // layernorm 2 - struct ggml_tensor * ln_2_w; - struct ggml_tensor * ln_2_b; -}; - -struct clip_vision_model { - struct clip_hparams hparams; - - // embeddings - struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings; - struct ggml_tensor * patch_bias; - struct ggml_tensor * position_embeddings; - - struct ggml_tensor * pre_ln_w; - struct ggml_tensor * pre_ln_b; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - // LLaVA projection - struct ggml_tensor * mm_0_w = NULL; - struct ggml_tensor * mm_0_b = NULL; - struct ggml_tensor * mm_2_w = NULL; - struct ggml_tensor * mm_2_b = NULL; - - struct ggml_tensor * image_newline = NULL; - - // Yi type models with mlp+normalization projection - struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 - struct ggml_tensor * mm_1_b = NULL; - struct ggml_tensor * mm_3_w = NULL; - struct ggml_tensor * mm_3_b = NULL; - struct ggml_tensor * mm_4_w = NULL; - struct ggml_tensor * mm_4_b = NULL; - - // MobileVLM projection - struct ggml_tensor * mm_model_mlp_1_w; - struct ggml_tensor * mm_model_mlp_1_b; - struct ggml_tensor * mm_model_mlp_3_w; - struct ggml_tensor * mm_model_mlp_3_b; - struct ggml_tensor * mm_model_block_1_block_0_0_w; - struct ggml_tensor * mm_model_block_1_block_0_1_w; - struct ggml_tensor * mm_model_block_1_block_0_1_b; - struct ggml_tensor * mm_model_block_1_block_1_fc1_w; - struct ggml_tensor * mm_model_block_1_block_1_fc1_b; - struct ggml_tensor * mm_model_block_1_block_1_fc2_w; - struct ggml_tensor * mm_model_block_1_block_1_fc2_b; - struct ggml_tensor * mm_model_block_1_block_2_0_w; - struct ggml_tensor * mm_model_block_1_block_2_1_w; - struct ggml_tensor * mm_model_block_1_block_2_1_b; - struct ggml_tensor * mm_model_block_2_block_0_0_w; - struct ggml_tensor * mm_model_block_2_block_0_1_w; - struct ggml_tensor * mm_model_block_2_block_0_1_b; - struct ggml_tensor * mm_model_block_2_block_1_fc1_w; - struct ggml_tensor * mm_model_block_2_block_1_fc1_b; - struct ggml_tensor * mm_model_block_2_block_1_fc2_w; - struct ggml_tensor * mm_model_block_2_block_1_fc2_b; - struct ggml_tensor * mm_model_block_2_block_2_0_w; - struct ggml_tensor * mm_model_block_2_block_2_1_w; - struct ggml_tensor * mm_model_block_2_block_2_1_b; - - // MobileVLM_V2 projection - struct ggml_tensor * mm_model_mlp_0_w; - struct ggml_tensor * mm_model_mlp_0_b; - struct ggml_tensor * mm_model_mlp_2_w; - struct ggml_tensor * mm_model_mlp_2_b; - struct ggml_tensor * mm_model_peg_0_w; - struct ggml_tensor * mm_model_peg_0_b; - - // MINICPMV projection - struct ggml_tensor * mm_model_pos_embed_k; - struct ggml_tensor * mm_model_query; - struct ggml_tensor * mm_model_proj; - struct ggml_tensor * mm_model_kv_proj; - struct ggml_tensor * mm_model_attn_q_w; - struct ggml_tensor * mm_model_attn_q_b; - struct ggml_tensor * mm_model_attn_k_w; - struct ggml_tensor * mm_model_attn_k_b; - struct ggml_tensor * mm_model_attn_v_w; - struct ggml_tensor * mm_model_attn_v_b; - struct ggml_tensor * mm_model_attn_o_w; - struct ggml_tensor * mm_model_attn_o_b; - struct ggml_tensor * mm_model_ln_q_w; - struct ggml_tensor * mm_model_ln_q_b; - struct ggml_tensor * mm_model_ln_kv_w; - struct ggml_tensor * mm_model_ln_kv_b; - struct ggml_tensor * mm_model_ln_post_w; - struct ggml_tensor * mm_model_ln_post_b; -}; - -struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; - bool has_llava_projector = false; - bool has_minicpmv_projector = false; - - struct clip_vision_model vision_model; - projector_type proj_type = PROJECTOR_TYPE_MLP; - - float image_mean[3]; - float image_std[3]; - bool use_gelu = false; - int32_t ftype = 1; - - bool has_class_embedding = true; - bool has_pre_norm = true; - bool has_post_norm = false; - bool has_patch_bias = false; - - struct gguf_context * ctx_gguf; - struct ggml_context * ctx_data; - - std::vector buf_compute_meta; - - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; - - ggml_backend_t backend = NULL; - ggml_gallocr_t compute_alloc = NULL; - - struct clip_image_size * load_image_size; -}; - -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { - if (load_image_size == nullptr) { - load_image_size = clip_image_size_init(); - } - LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); - image_size_width = load_image_size->width; - image_size_height = load_image_size->height; - if (is_inf) { - image_size_width = imgs->data->nx; - image_size_height = imgs->data->ny; - } - } - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - const int batch_size = imgs->size; - - if (ctx->has_llava_projector || ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - if (ctx->has_patch_bias) { - // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); - } - struct ggml_tensor * embeddings = inp; - struct ggml_tensor * pos_embed = nullptr; - - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (ctx->has_class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - } - - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); - - if (ctx->has_minicpmv_projector) { - int pos_w = image_size_width/patch_size; - int pos_h = image_size_height/patch_size; - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - ggml_set_name(pos_embed, "pos_embed"); - ggml_set_input(pos_embed); - } - - // pre-layernorm - if (ctx->has_pre_norm) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "pre_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); - } - - // loop over layers - if (ctx->has_minicpmv_projector) { - n_layer += 1; - } - for (int il = 0; il < n_layer - 1; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states - - //const size_t nb_q_w = model.layers[il].q_w->nb[0]; - - // layernorm1 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), - model.layers[il].ln_1_b); - } - - // self-attention - { - - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_inplace(ctx0, KQ); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); - } - - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - if (ctx->use_gelu) { - cur = ggml_gelu_inplace(ctx0, cur); - } else { - cur = ggml_gelu_quick_inplace(ctx0, cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (ctx->has_post_norm) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); - } - - // llava projector - if (ctx->has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - struct ggml_tensor * block_1 = nullptr; - { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); - - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); - } - - // block_2 - { - // stride = 2 - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); - - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); - } - - { // attention - const int hidden_size = 4096; - const int d_head = 128; - const int n_head = hidden_size/d_head; - const int num_query = 96; - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_inplace(ctx0, KQ); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - } - else { - GGML_ASSERT(false); - } - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - ggml_free(ctx0); - - return gf; -} - -// read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { - struct ggml_context * meta = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - struct gguf_context * ctx = gguf_init_from_file(fname, params); - if (!ctx) { - throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); - } - - if (verbosity >= 1) { - const int n_tensors = gguf_get_n_tensors(ctx); - const int n_kv = gguf_get_n_kv(ctx); - const int ftype = get_u32(ctx, KEY_FTYPE); - const std::string ftype_str = get_ftype(ftype); - const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); - const std::string description = gguf_get_val_str(ctx, idx_desc); - const int idx_name = gguf_find_key(ctx, KEY_NAME); - if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug - const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); - } - LOG_TEE("%s: description: %s\n", __func__, description.c_str()); - LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); - LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_TEE("\n"); - } - const int n_tensors = gguf_get_n_tensors(ctx); - - // kv - const int n_kv = gguf_get_n_kv(ctx); - LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", - __func__, n_kv, n_tensors, fname); - { - std::map n_type; - - for (int i = 0; i < n_tensors; i++) { - enum ggml_type type = gguf_get_tensor_type(ctx, i); - - n_type[type]++; - } - - LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); - for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx, i); - const enum gguf_type type = gguf_get_kv_type(ctx, i); - const std::string type_name = - type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) - : gguf_type_name(type); - - std::string value = gguf_kv_to_str(ctx, i); - const size_t MAX_VALUE_LEN = 40; - if (value.size() > MAX_VALUE_LEN) { - value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); - } - replace_all(value, "\n", "\\n"); - - LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); - } - - // print type counts - for (auto & kv : n_type) { - if (kv.second == 0) { - continue; - } - - LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); - } - } - - // data - size_t model_size = 0; - { - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - const size_t offset = gguf_get_tensor_offset(ctx, i); - enum ggml_type type = gguf_get_tensor_type(ctx, i); - struct ggml_tensor * cur = ggml_get_tensor(meta, name); - size_t tensor_size = ggml_nbytes(cur); - model_size += tensor_size; - if (verbosity >= 3) { - LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", - __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); - } - } - } - - clip_ctx * new_clip = new clip_ctx; - - // update projector type - { - int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); - if (idx != -1) { - const std::string proj_type = gguf_get_val_str(ctx, idx); - new_clip->proj_type = clip_projector_type_from_string(proj_type); - } else { - new_clip->proj_type = PROJECTOR_TYPE_MLP; - } - - if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { - if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { - new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; - } - } - } - -#ifdef GGML_USE_CUDA - new_clip->backend = ggml_backend_cuda_init(0); - LOG_TEE("%s: CLIP using CUDA backend\n", __func__); -#endif - -#ifdef GGML_USE_METAL - new_clip->backend = ggml_backend_metal_init(); - LOG_TEE("%s: CLIP using Metal backend\n", __func__); -#endif - -#ifdef GGML_USE_CANN - new_clip->backend = ggml_backend_cann_init(0); - LOG_TEE("%s: CLIP using CANN backend\n", __func__); -#endif - - - if (!new_clip->backend) { - new_clip->backend = ggml_backend_cpu_init(); - LOG_TEE("%s: CLIP using CPU backend\n", __func__); - } - - // model size and capabilities - { - int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); - new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); - - idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); - new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); - - idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); - if (idx != -1) { - new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); - } - - idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); - if (idx != -1) { - new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); - } - - // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search - - GGML_ASSERT(new_clip->has_vision_encoder); - GGML_ASSERT(!new_clip->has_text_encoder); - - idx = get_key_idx(ctx, KEY_USE_GELU); - new_clip->use_gelu = gguf_get_val_bool(ctx, idx); - - if (verbosity >= 1) { - LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); - } - } - - LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); - - // load tensors - { - std::vector read_buf; - struct ggml_init_params params = { - /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - new_clip->ctx_data = ggml_init(params); - if (!new_clip->ctx_data) { - LOG_TEE("%s: ggml_init() failed\n", __func__); - clip_free(new_clip); - gguf_free(ctx); - return nullptr; - } - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - LOG_TEE("cannot open model file for loading tensors\n"); - clip_free(new_clip); - gguf_free(ctx); - return nullptr; - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor * t = ggml_get_tensor(meta, name); - struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t); - ggml_set_name(cur, name); - } - - // alloc memory and offload data - new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - fin.seekg(offset, std::ios::beg); - if (!fin) { - LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); - clip_free(new_clip); - gguf_free(ctx); - return nullptr; - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - fin.close(); - } - - // vision model - if (new_clip->has_vision_encoder) { - // load vision model - auto & vision_model = new_clip->vision_model; - auto & hparams = vision_model.hparams; - hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); - hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); - hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); - hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); - hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); - hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); - hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); - hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); - - try { - int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); - int n = gguf_get_arr_n(ctx, idx); - const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); - for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { - hparams.image_grid_pinpoints[i] = pinpoints[i]; - } - if (n < 32) - hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & /*e*/) { - hparams.image_grid_pinpoints[0]=0; - } - - try { - int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); - strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); - } catch (std::runtime_error & /*e*/) { - strcpy(hparams.mm_patch_merge_type, "flat"); - } - - try { - hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } catch(const std::exception& /*e*/) { - hparams.image_crop_resolution = hparams.image_size; - } - - int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); - int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); - - const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); - const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); - - for (int i = 0; i < 3; ++i) { - new_clip->image_mean[i] = mean_data[i]; - new_clip->image_std[i] = std_data[i]; - } - - if (verbosity >= 2) { - LOG_TEE("\n%s: vision model hparams\n", __func__); - LOG_TEE("image_size %d\n", hparams.image_size); - LOG_TEE("patch_size %d\n", hparams.patch_size); - LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); - LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); - LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); - LOG_TEE("v_n_head %d\n", hparams.n_head); - LOG_TEE("v_n_layer %d\n", hparams.n_layer); - LOG_TEE("v_eps %f\n", hparams.eps); - LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_TEE("v_image_grid_pinpoints: "); - for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); - } - LOG_TEE("\n"); - LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); - - } - - try { - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); - new_clip->has_class_embedding = true; - } catch (const std::exception& /*e*/) { - new_clip->has_class_embedding = false; - } - - try { - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - new_clip->has_pre_norm = true; - } catch (std::exception & /*e*/) { - new_clip->has_pre_norm = false; - } - - try { - vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); - vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); - new_clip->has_post_norm = true; - } catch (std::exception & /*e*/) { - new_clip->has_post_norm = false; - } - - try { - vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); - new_clip->has_patch_bias = true; - } catch (std::exception & /*e*/) { - new_clip->has_patch_bias = false; - } - - try { - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - } catch(const std::exception& /*e*/) { - LOG_TEE("%s: failed to load vision model tensors\n", __func__); - } - - // LLaVA projection - if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { - vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - try { - // Yi-type llava - vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); - vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & /*e*/) { } - try { - // missing in Yi-type llava - vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & /*e*/) { } - try { - // Yi-type llava - vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); - vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & /*e*/) { } - try { - // Yi-type llava - vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); - vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & /*e*/) { } - try { - vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); - } catch (std::runtime_error & /*e*/) { } - } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { - // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); - vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); - vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); - vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); - vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); - vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); - vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); - vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); - vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); - } - else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2) - { - // MobilVLM_V2 projection - vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight")); - vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias")); - vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight")); - vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias")); - vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); - vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); - } - else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { - // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); - vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); - vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); - vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); - vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); - vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); - vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); - vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); - vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); - vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); - vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); - vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); - vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); - vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); - vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); - vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); - vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); - vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); - vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); - } - else { - std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; - throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); - } - - vision_model.layers.resize(hparams.n_layer); - - for (int il = 0; il < hparams.n_layer; ++il) { - auto & layer = vision_model.layers[il]; - layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); - layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight")); - layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight")); - layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight")); - layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight")); - layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight")); - layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight")); - layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias")); - layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias")); - layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias")); - layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias")); - layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias")); - layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias")); - layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias")); - layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias")); - } - } - - ggml_free(meta); - - new_clip->ctx_gguf = ctx; - - // measure mem requirement and allocate - { - new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); - clip_image_f32_batch batch; - batch.size = 1; - ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); - ggml_gallocr_reserve(new_clip->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); - } - - return new_clip; -} - -void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { - ctx_clip->load_image_size = load_image_size; -} - -struct clip_image_size * clip_image_size_init() { - struct clip_image_size * load_image_size = new struct clip_image_size(); - load_image_size->width = 448; - load_image_size->height = 448; - return load_image_size; -} - -struct clip_image_u8 * clip_image_u8_init() { - return new clip_image_u8(); -} - -struct clip_image_f32 * clip_image_f32_init() { - return new clip_image_f32(); -} - -void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } -void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} -void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} - -static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { - img->nx = nx; - img->ny = ny; - img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), data, img->buf.size()); -} - -bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load(fname, &nx, &ny, &nc, 3); - if (!data) { - LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); - return false; - } - build_clip_img_from_data(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); - if (!data) { - LOG_TEE("%s: failed to decode image bytes\n", __func__); - return false; - } - build_clip_img_from_data(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -// Linear interpolation between two points -inline float clip_lerp(float s, float e, float t) { - return s + (e - s) * t; -} -// Bilinear resize function -static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float x_ratio = static_cast(src.nx - 1) / target_width; - float y_ratio = static_cast(src.ny - 1) / target_height; - - for (int y = 0; y < target_height; y++) { - for (int x = 0; x < target_width; x++) { - float px = x_ratio * x; - float py = y_ratio * y; - int x_floor = static_cast(px); - int y_floor = static_cast(py); - float x_lerp = px - x_floor; - float y_lerp = py - y_floor; - - for (int c = 0; c < 3; c++) { - float top = clip_lerp( - static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), - static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - float bottom = clip_lerp( - static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), - static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), - x_lerp - ); - dst.buf[3 * (y * target_width + x) + c] = static_cast(clip_lerp(top, bottom, y_lerp)); - } - } - } -} - -// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { - dst->nx = src->nx; - dst->ny = src->ny; - dst->buf.resize(src->buf.size()); - - for (size_t i = 0; i < src->buf.size(); ++i) { - int c = i % 3; // rgb - dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - } -} - -inline float clip(float x, float lower, float upper) { - return std::max(lower, std::min(x, upper)); -} - -static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; - - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float Cc; - float C[5]; - float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, jj; - int x, y; - float dx, dy; - float tx, ty; - - tx = (float)nx / (float)target_width; - ty = (float)ny / (float)target_height; - - // Bicubic interpolation; adapted from ViT.cpp, inspired from : - // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 - // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - - for (i = 0; i < target_height; i++) { - for (j = 0; j < target_width; j++) { - x = (int)(tx * j); - y = (int)(ty * i); - - dx = tx * j - x; - dy = ty * i - y; - - for (k = 0; k < 3; k++) { - for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - - C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; - - d0 = C[0] - C[1]; - d2 = C[2] - C[1]; - d3 = C[3] - C[1]; - a0 = C[1]; - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; - - const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); - } - } - } - } - - return true; -} - -// llava-1.6 type of resize_and_pad (black) -static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { - int target_width = target_resolution.first; - int target_height = target_resolution.second; - - float scale_w = static_cast(target_width) / image.nx; - float scale_h = static_cast(target_height) / image.ny; - - int new_width, new_height; - - if (scale_w < scale_h) { - new_width = target_width; - new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); - } else { - new_height = target_height; - new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); - } - - clip_image_u8 resized_image; - // bilinear_resize(image, resized_image, new_width, new_height); - bicubic_resize(image, resized_image, new_width, new_height); - - clip_image_u8 padded_image; - padded_image.nx = target_width; - padded_image.ny = target_height; - padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black - - // Calculate padding offsets - int pad_x = (target_width - new_width) / 2; - int pad_y = (target_height - new_height) / 2; - - // Copy the resized image into the center of the padded buffer - for (int y = 0; y < new_height; ++y) { - for (int x = 0; x < new_width; ++x) { - for (int c = 0; c < 3; ++c) { - padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; - } - } - } - image_output = std::move(padded_image); -} - -/** - * Selects the best resolution from a list of possible resolutions based on the original size. - * - * @param original_size The original size of the image in the format (width, height). - * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. - * @return The best fit resolution in the format (width, height). - */ -static std::pair select_best_resolution(const std::pair & original_size, const std::vector> & possible_resolutions) { - int original_width = original_size.first; - int original_height = original_size.second; - std::pair best_fit; - int max_effective_resolution = 0; - int min_wasted_resolution = std::numeric_limits::max(); - - for (const auto& resolution : possible_resolutions) { - int width = resolution.first; - int height = resolution.second; - float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); - int downscaled_width = static_cast(original_width * scale); - int downscaled_height = static_cast(original_height * scale); - int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); - int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); - if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { - max_effective_resolution = effective_resolution; - min_wasted_resolution = wasted_resolution; - best_fit = resolution; - } - } - - return best_fit; -} - -static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { - std::vector patches; - int width = image.nx; - int height = image.ny; - for (int i = 0; i < height; i += patch_size) { - for (int j = 0; j < width; j += patch_size) { - clip_image_u8 *patch = clip_image_u8_init(); - patch->nx = std::min(patch_size, width - j); - patch->ny = std::min(patch_size, height - i); - patch->buf.resize(3 * patch->nx * patch->ny); - for (int y = 0; y < patch->ny; ++y) { - for (int x = 0; x < patch->nx; ++x) { - for (int c = 0; c < 3; ++c) { - patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; - } - } - } - patches.push_back(patch); - } - } - return patches; -} - -static int ensure_divide(int length, int patch_size) { - return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); -} - -static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.first; - int height = original_size.second; - if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { - float r = static_cast(width) / height; - height = static_cast(scale_resolution / std::sqrt(r)); - width = static_cast(height * r); - } - int best_width = ensure_divide(width, patch_size); - int best_height = ensure_divide(height, patch_size); - return std::make_pair(best_width, best_height); -} - -static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width, height; - std::tie(width, height) = original_size; - int grid_x, grid_y; - std::tie(grid_x, grid_y) = grid; - - int refine_width = ensure_divide(width, grid_x); - int refine_height = ensure_divide(height, grid_y); - - int grid_width = refine_width / grid_x; - int grid_height = refine_height / grid_y; - - // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) - auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair - int best_grid_width, best_grid_height; - std::tie(best_grid_width, best_grid_height) = best_grid_size; - - // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) - std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) - return refine_size; -} - -inline int clip(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { - std::vector candidate_split_grids_nums; - for (int i : {multiple - 1, multiple, multiple + 1}) { - if (i == 1 || i > max_slice_nums) { - continue; - } - candidate_split_grids_nums.push_back(i); - } - - std::vector> candidate_grids; - for (int split_grids_nums : candidate_split_grids_nums) { - int m = 1; - while (m <= split_grids_nums) { - if (split_grids_nums % m == 0) { - candidate_grids.emplace_back(m, split_grids_nums / m); - } - ++m; - } - } - - std::pair best_grid{1, 1}; - float min_error = std::numeric_limits::infinity(); - for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); - if (error < min_error) { - best_grid = grid; - min_error = error; - } - } - return best_grid; -} - -// inspired from LLaVA-UHD: -// -> https://arxiv.org/pdf/2403.11703 -// -> https://github.com/thunlp/LLaVA-UHD -// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 -static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { - const std::pair original_size={img->nx,img->ny}; - const int original_width = img->nx; - const int original_height = img->ny; - const float log_ratio = log(1.0*original_width/original_height); - const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); - const int multiple = fmin(ceil(ratio), max_slice_nums); - - std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); - images.push_back(std::vector()); - - if (multiple <= 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); - clip_image_u8 * source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.resize(best_size, Image.Resampling.BICUBIC) - images[images.size()-1].push_back(source_image); - } - else if (multiple > 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); - clip_image_u8 * source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); - images[images.size()-1].push_back(source_image); - - std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); - - auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); - clip_image_u8 * refine_image = clip_image_u8_init(); - bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); - - // split_to_patches - int width = refine_image->nx; - int height = refine_image->ny; - int grid_x = int(width / best_grid.first); - int grid_y = int(height / best_grid.second); - for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ - images.push_back(std::vector()); - for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ - clip_image_u8 * patch = clip_image_u8_init(); - patch->nx = grid_x; - patch->ny = grid_y; - patch->buf.resize(3 * patch->nx * patch->ny); - for (int y = patches_i; y < patches_i + grid_y; ++y) { - for (int x = patches_j; x < patches_j + grid_x; ++x) { - const int i = 3 * (y * refine_image->nx + x); - const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); - patch->buf[j] = refine_image->buf[i]; - patch->buf[j+1] = refine_image->buf[i+1]; - patch->buf[j+2] = refine_image->buf[i+2]; - } - } - images[images.size()-1].push_back(patch); - } - } - } - return images; -} - -int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { - const int max_slice_nums=9; - const int scale_resolution=448; - const int original_width = ctx_clip->load_image_size->width; - const int original_height = ctx_clip->load_image_size->height; - const float log_ratio = log(1.0*original_width/original_height); - const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); - const int multiple = fmin(ceil(ratio), max_slice_nums); - std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - return best_grid.first; -} - -// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector -// res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { - if (clip_is_minicpmv(ctx)) { - std::vector> imgs = uhd_slice_image(img); - res_imgs->size = 0; - for (size_t i = 0; i < imgs.size(); ++i) { - res_imgs->size += imgs[i].size(); - } - res_imgs->data = new clip_image_f32[res_imgs->size]; - int idx = 0; - for (size_t i = 0; i < imgs.size(); ++i) { - for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); - clip_image_f32 * res = clip_image_f32_init(); - normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); - res_imgs->data[idx++] = *res; - clip_image_f32_free(res); - } - } - return true; - } - - bool pad_to_square = true; - if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); - return false; - } - auto & params = ctx->vision_model.hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { - pad_to_square = false; - } - // free the previous res_imgs if any set - if (res_imgs->size > 0) { - clip_image_f32_batch_free(res_imgs); - } - res_imgs->data = nullptr; - res_imgs->size = 0; - - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - - clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily - if (pad_to_square && img->nx != img->ny) { - int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); - const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) - - // fill with background color - for (size_t i = 0; i < temp->buf.size(); i++) { - temp->buf[i] = bc[i % 3]; - } - - // copy from the input image - for (int y = 0; y < img->ny; y++) { - for (int x = 0; x < img->nx; x++) { - const int i = 3 * (y * img->nx + x); - const int j = 3 * (y * temp->nx + x); - temp->buf[j] = img->buf[i]; - temp->buf[j+1] = img->buf[i+1]; - temp->buf[j+2] = img->buf[i+2]; - } - } - } else { - if (params.image_grid_pinpoints[0] != 0) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - std::vector> possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { - possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); - } - std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); - // clip_image_save_to_bmp(*img, "input.bmp"); - resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 - // clip_image_save_to_bmp(*temp, "resized.bmp"); - // visually verify normalized image: - // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); - // { - // clip_image_u8 * temp2 = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*res, *temp2); - // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); - // clip_image_u8_free(temp2); - // } - - std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) - - clip_image_u8 *image_original_resize = clip_image_u8_init(); - // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square - bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square - patches.insert(patches.begin(), image_original_resize); - // clip_image_f32_batch_init(patches.size()); - res_imgs->size = patches.size(); - res_imgs->data = new clip_image_f32[res_imgs->size]; - int num=0; - for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); - num++; - } - - for (size_t i = 0; i < patches.size(); i++) { - // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); - clip_image_u8_free(patches[i]); - } - - clip_image_u8_free(temp); - - return true; - } else { - temp->nx = img->nx; - temp->ny = img->ny; - temp->buf.resize(img->buf.size()); - memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); - } - } - - const int nx = temp->nx; - const int ny = temp->ny; - // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); - - const int nx2 = ctx->vision_model.hparams.image_size; - const int ny2 = ctx->vision_model.hparams.image_size; - clip_image_f32 * res = clip_image_f32_init(); - res->nx = nx2; - res->ny = ny2; - res->buf.resize(3 * nx2 * ny2); - - const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; - - const int nx3 = int(nx / scale + 0.5f); - const int ny3 = int(ny / scale + 0.5f); - - const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; - const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; - - for (int y = 0; y < ny3; y++) { - for (int x = 0; x < nx3; x++) { - for (int c = 0; c < 3; c++) { - // linear interpolation - const float sx = (x + 0.5f) * scale - 0.5f; - const float sy = (y + 0.5f) * scale - 0.5f; - - const int x0 = std::max(0, (int)std::floor(sx)); - const int y0 = std::max(0, (int)std::floor(sy)); - - const int x1 = std::min(x0 + 1, nx - 1); - const int y1 = std::min(y0 + 1, ny - 1); - - const float dx = sx - x0; - const float dy = sy - y0; - - const int j00 = 3 * (y0 * nx + x0) + c; - const int j01 = 3 * (y0 * nx + x1) + c; - const int j10 = 3 * (y1 * nx + x0) + c; - const int j11 = 3 * (y1 * nx + x1) + c; - - const float v00 = temp->buf[j00]; - const float v01 = temp->buf[j01]; - const float v10 = temp->buf[j10]; - const float v11 = temp->buf[j11]; - - const float v0 = v00 * (1.0f - dx) + v01 * dx; - const float v1 = v10 * (1.0f - dx) + v11 * dx; - - const float v = v0 * (1.0f - dy) + v1 * dy; - - const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); - - const int i = 3 * (y * nx3 + x) + c; - - res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; - } - } - } - clip_image_u8_free(temp); - - // { - // clip_image_u8 * temp2 = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*res, *temp2); - // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); - // clip_image_u8_free(temp2); - // } - // res_imgs.push_back(res); - - res_imgs->size = 1; - res_imgs->data = new clip_image_f32[res_imgs->size]; - res_imgs->data[0] = *res; - clip_image_f32_free(res); - - return true; -} - -ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; -} - -void clip_free(clip_ctx * ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); - delete ctx; -} - -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - -int32_t clip_image_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t clip_patch_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t clip_hidden_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -const char * clip_patch_merge_type(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.mm_patch_merge_type; -} - -const int32_t * clip_image_grid(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_grid_pinpoints; -} - -int clip_n_patches(const struct clip_ctx * ctx) { - const auto & params = ctx->vision_model.hparams; - - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - - if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - n_patches = 96; - } - - return n_patches; -} - -static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { - assert(embed_dim % 2 == 0); - int H = pos.size(); - int W = pos[0].size(); - - std::vector omega(embed_dim / 2); - for (int i = 0; i < embed_dim / 2; ++i) { - omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); - } - - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - float out_value = pos[h][w] * omega[d]; - emb[h][w][d] = sin(out_value); - emb[h][w][d + embed_dim / 2] = cos(out_value); - } - } - } - - return emb; -} - -static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { - assert(embed_dim % 2 == 0); - std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) - std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) - - int H = emb_h.size(); - int W = emb_h[0].size(); - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - emb[h][w][d] = emb_h[h][w][d]; - emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; - } - } - } - return emb; -} - -static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { - int grid_h_size = image_size.first; - int grid_w_size = image_size.second; - - std::vector grid_h(grid_h_size); - std::vector grid_w(grid_w_size); - - for (int i = 0; i < grid_h_size; ++i) { - grid_h[i] = static_cast(i); - } - for (int i = 0; i < grid_w_size; ++i) { - grid_w[i] = static_cast(i); - } - - std::vector> grid(grid_h_size, std::vector(grid_w_size)); - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid[h][w] = grid_w[w]; - } - } - std::vector>> grid_2d = {grid, grid}; - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid_2d[0][h][w] = grid_h[h]; - grid_2d[1][h][w] = grid_w[w]; - } - } - - std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); - - int H = image_size.first; - int W = image_size.second; - std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; - } - } - - return pos_embed_2d; -} - -bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); - return false; - } - - clip_image_f32_batch imgs{}; - imgs.size = 1; - imgs.data = img; - return clip_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); - return false; - } - - int batch_size = imgs->size; - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - - // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); - - // set inputs - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { - image_size_width = imgs->data[0].nx; - image_size_height = imgs->data[0].ny; - } - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); - - { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); - - for (size_t i = 0; i < imgs->size; i++) { - const int nx = imgs->data[i].nx; - const int ny = imgs->data[i].ny; - if (!ctx->has_minicpmv_projector) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - - const int n = nx * ny; - - for (int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; - } - } - } - } - } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); - } - if (ctx->has_minicpmv_projector) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = std::floor(70.0*i/num_positions); - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - if(ctx->load_image_size==nullptr){ - ctx->load_image_size= clip_image_size_init(); - } - int pos_w = ctx->load_image_size->width/patch_size; - int pos_h = ctx->load_image_size->height/patch_size; - int embed_dim = 4096; - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;ihas_class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - } - - { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; - } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } - - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(ctx->backend)) { - ggml_backend_metal_set_n_cb(ctx->backend, n_threads); - } -#endif - - ggml_backend_graph_compute(ctx->backend, gf); - - // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - return true; -} - -bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { - ggml_type type = GGML_TYPE_Q4_1; - - assert(itype < GGML_TYPE_COUNT); - type = static_cast(itype); - - auto * ctx_clip = clip_model_load(fname_inp, 2); - - const auto & ctx_src = ctx_clip->ctx_gguf; - const auto & ctx_data = ctx_clip->ctx_data; - - auto * ctx_out = gguf_init_empty(); - gguf_set_kv(ctx_out, ctx_src); - gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); - gguf_set_val_u32(ctx_out, "general.file_type", itype); - - auto fout = std::ofstream(fname_out, std::ios::binary); - - const int n_tensors = gguf_get_n_tensors(ctx_src); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - gguf_add_tensor(ctx_out, cur); - } - - const size_t meta_size = gguf_get_meta_size(ctx_out); - for (size_t i = 0; i < meta_size; ++i) { - fout.put(0); - } - - // regexes of tensor names to be quantized - const std::vector k_names = { - ".*weight", - }; - - std::vector work(512); - std::vector conv_buf(512); - size_t total_size_org = 0; - size_t total_size_new = 0; - - for (int i = 0; i < n_tensors; ++i) { - const std::string name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); - - enum ggml_type new_type; - void * new_data; - size_t new_size; - - bool quantize = false; - for (const auto & s : k_names) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // quantize only 2D tensors - quantize &= (ggml_n_dims(cur) == 2); - - if (quantize) { - new_type = type; - if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { - new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); - } - const size_t n_elms = ggml_nelements(cur); - float * f32_data; - - switch (cur->type) { - case GGML_TYPE_F32: - f32_data = (float *)cur->data; - break; - case GGML_TYPE_F16: - if (conv_buf.size() < n_elms) { - conv_buf.resize(n_elms); - } - for (size_t j = 0; j < n_elms; ++j) { - conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); - } - f32_data = (float *)conv_buf.data(); - break; - default: - LOG_TEE("Please use an input file in f32 or f16\n"); - gguf_free(ctx_out); - return false; - } - - if (work.size() < n_elms * 4) { - work.resize(n_elms * 4); - } - new_data = work.data(); - - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); - } else { - new_type = cur->type; - new_data = cur->data; - new_size = ggml_nbytes(cur); - } - const size_t orig_size = ggml_nbytes(cur); - total_size_org += orig_size; - total_size_new += new_size; - gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); - fout.write((const char *)new_data, new_size); - size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; - for (size_t j = 0; j < pad; ++j) { - fout.put(0); - } - - LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, - orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); - } - - // go back to beginning of file and write the updated metadata - fout.seekp(0, std::ios::beg); - std::vector meta(meta_size); - gguf_get_meta_data(ctx_out, meta.data()); - fout.write((const char *)meta.data(), meta_size); - - fout.close(); - - clip_free(ctx_clip); - gguf_free(ctx_out); - - { - LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); - } - - return true; -} - -int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - return 4096; - } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); -} - -bool clip_is_minicpmv(const struct clip_ctx * ctx) { - return ctx->has_minicpmv_projector; -} diff --git a/examples/llava/clip.h b/examples/llava/clip.h deleted file mode 100644 index 2ff4d399..00000000 --- a/examples/llava/clip.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef CLIP_H -#define CLIP_H - -#include -#include - -#ifdef LLAMA_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef LLAMA_BUILD -# define CLIP_API __declspec(dllexport) -# else -# define CLIP_API __declspec(dllimport) -# endif -# else -# define CLIP_API __attribute__ ((visibility ("default"))) -# endif -#else -# define CLIP_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct clip_ctx; - -struct clip_image_size { - int width; - int height; -}; - -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; - -struct clip_image_f32_batch { - struct clip_image_f32 * data; - size_t size; -}; - -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); -CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); - -CLIP_API void clip_free(struct clip_ctx * ctx); - -CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); - -CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); - -// TODO: should be enum, not string -CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); - -CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); - -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); - -CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); -CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); - -CLIP_API struct clip_image_size * clip_image_size_init(); -CLIP_API struct clip_image_u8 * clip_image_u8_init (); -CLIP_API struct clip_image_f32 * clip_image_f32_init(); - -CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); -CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); -CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); -CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); - -CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); - -/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ -CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); - -/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); - -CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); - -CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); -CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); - -CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); - -CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); - -#ifdef __cplusplus -} -#endif - -#endif // CLIP_H diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py deleted file mode 100644 index 36f6b92f..00000000 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ /dev/null @@ -1,333 +0,0 @@ -import argparse -import os -import json -import re - -import torch -import numpy as np -from gguf import * -from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel - -TEXT = "clip.text" -VISION = "clip.vision" - - -def k(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - if "mm_projector" in name: - name = name.replace("model.mm_projector", "mm") - name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) - name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) - return name - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip-model-is-vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") -ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.48145466, 0.4578275, 0.40821073] -default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if args.clip_model_is_vision: - v_hparams = config - t_hparams = None - else: - v_hparams = config["vision_config"] - t_hparams = config["text_config"] - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -if args.clip_model_is_vision or args.clip_model_is_openclip: - model = CLIPVisionModel.from_pretrained(dir_model) - processor = None -else: - model = CLIPModel.from_pretrained(dir_model) - processor = CLIPProcessor.from_pretrained(dir_model) - -fname_middle = None -has_text_encoder = True -has_vision_encoder = True -has_llava_projector = False -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.llava_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_llava_projector = True -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_llava_projector", has_llava_projector) -fout.add_file_type(ftype) -model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) -fout.add_name(model_name) -if args.text_only: - fout.add_description("text-only CLIP model") -elif args.vision_only and not has_llava_projector: - fout.add_description("vision-only CLIP model") -elif has_llava_projector: - fout.add_description("image encoder for LLaVA") - # add projector type - fout.add_string("clip.projector_type", args.projector_type) -else: - fout.add_description("two-tower CLIP model") - -if has_text_encoder: - assert t_hparams is not None - assert tokens is not None - # text_model hparams - fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) - fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) - fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) - fout.add_token_list(tokens) - -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) - fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) - fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) - block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] - fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) - # /** - # "image_grid_pinpoints": [ - # [ - # 336, - # 672 - # ], - # [ - # 672, - # 336 - # ], - # [ - # 672, - # 672 - # ], - # [ - # 1008, - # 336 - # ], - # [ - # 336, - # 1008 - # ] - # ], - # Flattened: - # [ - # 336, 672, - # 672, 336, - # 672, 672, - # 1008, 336, - # 336, 1008 - # ] - # * - # */ - if "image_grid_pinpoints" in v_hparams: - # flatten it - image_grid_pinpoints = [] - for pinpoint in v_hparams["image_grid_pinpoints"]: - for p in pinpoint: - image_grid_pinpoints.append(p) - fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) - if "image_crop_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) - if "image_aspect_ratio" in v_hparams: - fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) - if "image_split_resolution" in v_hparams: - fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) - if "mm_patch_merge_type" in v_hparams: - fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) - if "mm_projector_type" in v_hparams: - fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) - - - if processor is not None: - image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] - image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue] - else: - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -use_gelu = v_hparams["hidden_act"] == "gelu" -fout.add_bool("clip.use_gelu", use_gelu) - - -if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] - projector = torch.load(args.llava_projector) - for name, data in projector.items(): - name = get_tensor_name(name) - # pw and dw conv ndim==4 - if data.ndim == 2 or data.ndim == 4: - data = data.squeeze().numpy().astype(np.float16) - else: - data = data.squeeze().numpy().astype(np.float32) - - fout.add_tensor(name, data) - - print("Projector tensors added\n") - -state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue - - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) - - -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() - -print("Done. Output file: " + fname_out) diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp deleted file mode 100644 index 889a6222..00000000 --- a/examples/llava/llava-cli.cpp +++ /dev/null @@ -1,340 +0,0 @@ -#include "ggml.h" -#include "log.h" -#include "common.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" - -#include "base64.hpp" - -#include -#include -#include - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { - LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); - eval_tokens(ctx_llama, embd_inp, n_batch, n_past); - return true; -} - -static const char * sample(struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); - llama_sampling_accept(ctx_sampling, ctx_llama, id, true); - static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static const char* IMG_BASE64_TAG_BEGIN = ""; - -static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { - begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); - end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); -} - -static bool prompt_contains_image(const std::string& prompt) { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - return (begin != std::string::npos); -} - -// replaces the base64 image tag in the prompt with `replacement` -static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { - size_t img_base64_str_start, img_base64_str_end; - find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); - if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { - LOG_TEE("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); - return NULL; - } - - auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); - auto base64_bytes_count = img_base64_str_end - base64_bytes_start; - auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); - - auto required_bytes = base64::required_encode_size(base64_str.size()); - auto img_bytes = std::vector(required_bytes); - base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); - - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); - if (!embed) { - LOG_TEE("%s: could not load image from base64 string.\n", __func__); - return NULL; - } - - return embed; -} - -static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - if (begin == std::string::npos || end == std::string::npos) { - return prompt; - } - auto pre = prompt.substr(0, begin); - auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); - return pre + replacement + post; -} - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - - LOG_TEE("\n example usage:\n"); - LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { - - // load and preprocess the image - llava_image_embed * embed = NULL; - auto prompt = params->prompt; - if (prompt_contains_image(prompt)) { - if (!params->image.empty()) { - LOG_TEE("using base64 encoded image instead of command line image path\n"); - } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); - if (!embed) { - LOG_TEE("%s: can't load image from prompt\n", __func__); - return NULL; - } - params->prompt = remove_image_from_prompt(prompt); - } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); - if (!embed) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); - return NULL; - } - } - - return embed; -} - -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { - int n_past = 0; - - const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; - - std::string system_prompt, user_prompt; - size_t image_pos = prompt.find(""); - if (image_pos != std::string::npos) { - // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); - user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } else { - // llava-1.5 native mode - system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; - user_prompt = prompt + "\nASSISTANT:"; - if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } - - eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - - // generate the response - - LOG_TEE("\n"); - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(ctx_llava->model),params->sparams); - if (!ctx_sampling) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); - } - - std::string response = ""; - for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); - if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) - if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 - if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 - - fflush(stdout); - } - - llama_sampling_free(ctx_sampling); - printf("\n"); -} - -static struct llama_model * llava_init(gpt_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = llama_model_params_from_gpt_params(*params); - - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); - if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - - - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); - ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings - - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->ctx_clip = ctx_clip; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); - llama_backend_free(); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); - return 1; - } - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv, {}); - return 1; - } - auto model = llava_init(¶ms); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init llava model\n", __func__); - return 1; - } - - if (prompt_contains_image(params.prompt)) { - auto ctx_llava = llava_init_context(¶ms, model); - - auto image_embed = load_image(ctx_llava, ¶ms, ""); - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_print_timings(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } else { - for (auto & image : params.image) { - auto ctx_llava = llava_init_context(¶ms, model); - - auto image_embed = load_image(ctx_llava, ¶ms, image); - if (!image_embed) { - std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; - return 1; - } - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_print_timings(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - } - - llama_free_model(model); - - return 0; -} diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp deleted file mode 100644 index 916d9dc4..00000000 --- a/examples/llava/llava.cpp +++ /dev/null @@ -1,480 +0,0 @@ -#include "clip.h" -#include "common.h" -#include "llama.h" -#include "llava.h" -#include "base64.hpp" - -#include -#include -#include -#include - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - -struct clip_image_grid_shape { - int first; - int second; -}; - -/** - * Selects the best resolution from a list of possible resolutions based on the original size. - * - * @param original_size The original size of the image in the format (width, height). - * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. - * @return The best fit resolution in the format (width, height). - */ -static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { - int original_width = original_size.first; - int original_height = original_size.second; - - std::pair best_fit; - int max_effective_resolution = 0; - int min_wasted_resolution = std::numeric_limits::max(); - - for (const auto& resolution : possible_resolutions) { - int width = resolution.first; - int height = resolution.second; - float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); - int downscaled_width = static_cast(original_width * scale); - int downscaled_height = static_cast(original_height * scale); - int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); - int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); - if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { - max_effective_resolution = effective_resolution; - min_wasted_resolution = wasted_resolution; - best_fit = resolution; - } - } - - return best_fit; -} - -/** - * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return - */ -static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_size, const std::vector> & grid_pinpoints, int image_patch_size) { - /** - Conversion from gguf flat array to vector: - std::vector> possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { - possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); - } - */ - auto best_resolution = select_best_resolution(image_size, grid_pinpoints); - return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; -} - -// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { - struct { - struct ggml_context * ctx; - } model; - - const int32_t image_size = clip_image_size(ctx_clip); - const int32_t patch_size = clip_patch_size(ctx_clip); - - int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) - - int num_patches_width = grid_shape.first; // grid 1-4 - int num_patches_height = grid_shape.second; // grid 1-4 - - const size_t num_images = num_patches_width * num_patches_height + 1; - - // TODO: size calculation is not calculated - it's only tens of MB - size_t ctx_size = 0; - - { - ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features - ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); - } - - struct ggml_init_params params { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API - }; - - // Python reference code for full unpad: - /* - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image(image_feature, image_sizes[image_idx]) - image_feature = torch.cat(( - image_feature, - self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) - ), dim=-1) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat((base_image_feature, image_feature), dim=0) - */ - // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval. - // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet. - // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. - // Once all images are processed to prepended the base_image_features without any changes. - - // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) - /* - image_feature = image_feature.view(2, 2, 24, 24, 4096) - image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() - image_feature = image_feature.view(2, 24, 2, 24, 4096) - image_feature = image_feature.flatten(0, 3) - - // Reshape to 4D tensor by merging the last two dimensions - image_feature = image_feature.view(2, 2, 24, 24*4096) - image_feature = image_feature.permute(0, 2, 1, 3).contiguous() - image_feature = image_feature.view(-1, 4096) - */ - - model.ctx = ggml_init(params); - - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 - // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); - // fill it with the image embeddings, ignoring the base - for (size_t i = 1; i < num_images; i++) { - size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); - memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); - } - - struct ggml_cgraph * gf = ggml_new_graph(model.ctx); - size_t size_ele = ggml_type_size(GGML_TYPE_F32); - - struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - num_patches_per_side, - num_patches_width, - num_patches_height, - size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, - size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); - // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); - struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); - /** - At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings - image_feature = torch.cat(( - image_feature, - self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) - ), dim=-1) - * - */ - - // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); - struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); - // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); - ggml_build_forward_expand(gf, flatten); - ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; - - memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context - // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); - - // Debug: Test single segments - // Current findings: sending base image, sending a segment embedding all works similar to python - // However, permuted embeddings do not work yet (stride issue?) - // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context - // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context - // *n_img_pos_out=576; - - ggml_free(model.ctx); - return true; -} - -static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { - int width = image->nx; - int height = image->ny; - int num_patches = (height / patch_size) * (width / patch_size); - clip_image_f32 * patch = clip_image_f32_init(); - patch->nx = patch_size * num_patches; - patch->ny = patch_size; - patch->buf.resize(3 * patch->nx * patch->ny); - - int patch_index = 0; - - for (int i = 0; i < height; i += patch_size) { - for (int j = 0; j < width; j += patch_size) { - for (int pi = 0; pi < patch_size; ++pi) { - for (int pj = 0; pj < patch_size; ++pj) { - int input_index = ((i + pi) * width + (j + pj)) * 3; - int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; - patch->buf[output_index] = image->buf[input_index]; - patch->buf[output_index+1] = image->buf[input_index+1]; - patch->buf[output_index+2] = image->buf[input_index+2]; - } - } - patch_index++; - } - } - return patch; -} - -static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 - clip_image_f32_batch img_res_v; - img_res_v.size = 0; - img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_TEE("%s: unable to preprocess image\n", __func__); - delete[] img_res_v.data; - return false; - } - - const int64_t t_img_enc_start_us = ggml_time_us(); - - const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - - if (clip_is_minicpmv(ctx_clip)) { - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - struct clip_image_size * load_image_size = clip_image_size_init(); - for (size_t i = 0; i < img_res_v.size; i++) { - const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); - int patch_size=14; - load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; - clip_add_load_image_size(ctx_clip, load_image_size); - const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - int n_img_pos_out = 0; - for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); - } - *n_img_pos = n_img_pos_out; - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - load_image_size->width = img->nx; - load_image_size->height = img->ny; - clip_add_load_image_size(ctx_clip, load_image_size); - LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); - } - else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { - // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 - delete[] img_res_v.data; - if (!encoded) { - LOG_TEE("Unable to encode image\n"); - - return false; - } - } - else { - // spatial_unpad llava-1.6 type embedding - // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - for (size_t i = 0; i < img_res_v.size; i++) { - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - const int32_t * image_grid = clip_image_grid(ctx_clip); - - std::vector> grid_pinpoints; - for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { - grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); - } - - // free all img_res_v - not needed anymore - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; - - const int32_t image_size = clip_image_size(ctx_clip); - - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); - - int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); - *n_img_pos = n_img_pos_out; - - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - - // debug image/segment/normalization content: - // clip_image_u8 * tmp = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*image_feature, *tmp); - // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - } - - LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); - - const int64_t t_img_enc_end_us = ggml_time_us(); - float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - - LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); - - return true; -} - -bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { - // make sure that the correct mmproj was used, i.e., compare apples to apples - int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); - auto n_image_embd = clip_n_mmproj_embd(ctx_clip); - if (n_image_embd != n_llama_embd) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); - return false; - } - return true; -} - -bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - int num_max_patches = 6; - if (clip_is_minicpmv(ctx_clip)) { - num_max_patches = 10; - } - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model - if (!image_embd) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); - return false; - } - - int n_img_pos; - if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_TEE("%s: cannot encode image, aborting\n", __func__); - free(image_embd); - return false; - } - *image_embd_out = image_embd; - *n_img_pos_out = n_img_pos; - - return true; -} - -bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { - int n_embd = llama_n_embd(llama_get_model(ctx_llama)); - - for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { - int n_eval = image_embed->n_image_pos - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; - if (llama_decode(ctx_llama, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); - return false; - } - *n_past += n_eval; - } - return true; -} - -struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { - clip_image_u8 * img = clip_image_u8_init(); - if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { - clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); - return NULL; - } - - float* image_embed = NULL; - int n_image_pos = 0; - bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); - if (!image_embed_result) { - clip_image_u8_free(img); - LOG_TEE("%s: coulnd't embed the image\n", __func__); - return NULL; - } - - clip_image_u8_free(img); - auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - result->embed = image_embed; - result->n_image_pos = n_image_pos; - return result; -} - -static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { - auto file = fopen(path, "rb"); - if (file == NULL) { - LOG_TEE("%s: can't read file %s\n", __func__, path); - return false; - } - - fseek(file, 0, SEEK_END); - auto fileSize = ftell(file); - fseek(file, 0, SEEK_SET); - - auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data - if (buffer == NULL) { - LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); - perror("Memory allocation error"); - fclose(file); - return false; - } - errno = 0; - size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer - if (ferror(file)) { - die_fmt("read error: %s", strerror(errno)); - } - if (ret != (size_t) fileSize) { - die("unexpectedly reached end of file"); - } - fclose(file); // Close the file - - *bytesOut = buffer; - *sizeOut = fileSize; - return true; -} - -struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { - unsigned char* image_bytes; - long image_bytes_length; - auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); - if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); - return NULL; - } - - llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); - free(image_bytes); - - return embed; -} - -void llava_image_embed_free(struct llava_image_embed * embed) { - free(embed->embed); - free(embed); -} diff --git a/examples/llava/llava.h b/examples/llava/llava.h deleted file mode 100644 index b6feb302..00000000 --- a/examples/llava/llava.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef LLAVA_H -#define LLAVA_H - -#include "ggml.h" - -#ifdef LLAMA_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef LLAMA_BUILD -# define LLAVA_API __declspec(dllexport) -# else -# define LLAVA_API __declspec(dllimport) -# endif -# else -# define LLAVA_API __attribute__ ((visibility ("default"))) -# endif -#else -# define LLAVA_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct clip_ctx; -struct llava_image_embed { - float * embed; - int n_image_pos; -}; - -/** sanity check for clip <-> llava embed size match */ -LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); - -LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); - -/** build an image embed from image file bytes */ -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -/** build an image embed from a path to an image filename */ -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -/** free an embedding made with llava_image_embed_make_* */ -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); - -/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ -LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/examples/llava/llava_surgery.py b/examples/llava/llava_surgery.py deleted file mode 100644 index 4f2da3be..00000000 --- a/examples/llava/llava_surgery.py +++ /dev/null @@ -1,38 +0,0 @@ -import argparse -import glob -import os -import torch - - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") -args = ap.parse_args() - -# find the model part that includes the the multimodal projector weights -path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1] -checkpoint = torch.load(path) - -# get a list of mm tensor names -mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] - -# store these tensors in a new dictionary and torch.save them -projector = {name: checkpoint[name].float() for name in mm_tensors} -torch.save(projector, f"{args.model}/llava.projector") - -# BakLLaVA models contain CLIP tensors in it -clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] -if len(clip_tensors) > 0: - clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} - torch.save(clip, f"{args.model}/llava.clip") - - - # added tokens should be removed to be able to convert Mistral models - if os.path.exists(f"{args.model}/added_tokens.json"): - with open(f"{args.model}/added_tokens.json", "w") as f: - f.write("{}\n") - - - -print("Done!") -print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/llava_surgery_v2.py b/examples/llava/llava_surgery_v2.py deleted file mode 100644 index 2d5b32fe..00000000 --- a/examples/llava/llava_surgery_v2.py +++ /dev/null @@ -1,159 +0,0 @@ -import argparse -import glob -import os -import torch -from safetensors import safe_open -from safetensors.torch import save_file -from typing import Any, ContextManager, cast - -# Function to determine if file is a SafeTensor file -def is_safetensor_file(file_path): - return file_path.endswith('.safetensors') - - -# Unified loading function -def load_model(file_path): - if is_safetensor_file(file_path): - tensors = {} - with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f: - for key in f.keys(): - tensors[key] = f.get_tensor(key).clone() - # output shape - print(f"{key} : {tensors[key].shape}") - return tensors, 'safetensor' - else: - return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' - - -# Unified saving function -def save_model(model, file_path, file_type): - if file_type == 'safetensor': - # safe_save(model, file_path) - save_file(model, file_path) - else: - torch.save(model, file_path) - - -# Adapted function to clean vision tower from checkpoint -def clean_vision_tower_from_checkpoint(checkpoint_path): - checkpoint, file_type = load_model(checkpoint_path) - # file_type = 'pytorch' - model_path = os.path.dirname(checkpoint_path) - print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))] - - if len(clip_tensors) > 0: - print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") - # Adapted for file type - clip_path = os.path.join(model_path, "llava.clip") - - if os.path.exists(clip_path): - print(f"Loading existing llava.clip from {clip_path}") - existing_clip, _ = load_model(clip_path) - else: - print(f"Creating new llava.clip at {clip_path}") - existing_clip = {} - # Update existing_clip with new tensors, avoid duplicates - for name in clip_tensors: - simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name - print(f"Adding {simple_name} to llava.clip") - if simple_name not in existing_clip: - existing_clip[simple_name] = checkpoint[name] - - # Save the updated clip tensors back to llava.clip - save_model(existing_clip, clip_path, 'pytorch') - - # Remove the tensors from the original checkpoint - for name in clip_tensors: - del checkpoint[name] - - checkpoint_path = checkpoint_path - return True - return False - -def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): - newline_checkpoint_path = None - projector_checkpoint_path = None - - for path in checkpoint_paths: - checkpoint, _ = load_model(path) - if newline_criteria(checkpoint) and newline_checkpoint_path is None: - newline_checkpoint_path = path - if projector(checkpoint): - projector_checkpoint_path = path - - return newline_checkpoint_path, projector_checkpoint_path - -def newline_criteria(checkpoint): - return any(k.startswith("model.image_newline") for k in checkpoint.keys()) - -def proj_criteria(checkpoint): - return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) - - -# Command-line interface setup -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") -ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") -args = ap.parse_args() - -if args.clean_vision_tower: - # Generalized to handle both PyTorch and SafeTensors models - model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) - # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] - checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for projector_checkpoint_path in checkpoint_paths: - print(f"Cleaning {projector_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): - print(f"No vision tower found in {projector_checkpoint_path}") - # we break once none is found, so far all models append them at the end - # break - print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") - -# Now we look for the projector in the last checkpoint -model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) -checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] -# last_checkpoint_path = checkpoint_paths[0] -# first_checkpoint_path = checkpoint_paths[-1] -newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) - -print(f"Taking projector from {projector_checkpoint_path}") -first_mm_tensors = [] -first_checkpoint = None -if newline_checkpoint_path is not None: - print(f"Taking newline from {newline_checkpoint_path}") - first_checkpoint, file_type = load_model(newline_checkpoint_path) - first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] - -# Load the checkpoint -mm_tensors = [] -last_checkpoint = None -if projector_checkpoint_path is not None: - last_checkpoint, file_type = load_model(projector_checkpoint_path) - mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] - -if len(mm_tensors) == 0: - if last_checkpoint is not None: - for k, v in last_checkpoint.items(): - print(k) - print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.") - print("No tensors found. Is this a LLaVA model?") - exit() - -print(f"Found {len(mm_tensors)} tensors to extract.") -print(f"Found additional {len(first_mm_tensors)} tensors to extract.") -# projector = {name: checkpoint.[name].float() for name in mm_tensors} -projector = {} -for name in mm_tensors: - assert last_checkpoint is not None - projector[name] = last_checkpoint[name].float() -for name in first_mm_tensors: - assert first_checkpoint is not None - projector[name] = first_checkpoint[name].float() - -if len(projector) > 0: - save_model(projector, f"{args.model}/llava.projector", 'pytorch') - -print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp deleted file mode 100644 index 022508a2..00000000 --- a/examples/llava/minicpmv-cli.cpp +++ /dev/null @@ -1,309 +0,0 @@ -#include "ggml.h" -#include "log.h" -#include "common.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" - -#include -#include -#include - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static struct llama_model * llava_init(gpt_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = llama_model_params_from_gpt_params(*params); - - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); - if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); - if (params->n_ctx < 2048) { - // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); - ctx_params.n_ctx = 2048; - } else { - ctx_params.n_ctx = params->n_ctx; - } - - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); - llama_backend_free(); -} - -static struct clip_ctx * clip_init_context(gpt_params * params) { - const char * clip_path = params->mmproj.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - return ctx_clip; -} - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { - LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); - return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); -} - -static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { - float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); - std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - - auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - slice_embed->embed = image_embed; - slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); - llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); - llava_image_embed_free(slice_embed); -} - -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { - std::string system_prompt; - int idx = 0; - int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; - LOG_TEE("%s: image token past: %d\n", __func__, n_past); - eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (num_image_embeds > 1) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); -} - -static const char * sample(struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); - llama_sampling_accept(ctx_sampling, ctx_llama, id, true); - static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); - if (!embeds) { - std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; - return NULL; - } - - // process the prompt - if (params->prompt.empty() && params->interactive == false) { - LOG_TEE("prompt should be given or interactive mode should be on"); - return NULL; - } - - auto model = llava_init(params); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); - return NULL; - } - const int64_t t_llava_init_start_us = ggml_time_us(); - auto ctx_llava = llava_init_context(params, model); - ctx_llava->ctx_clip = ctx_clip; - const int64_t t_llava_init_end_us = ggml_time_us(); - float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); - - const int64_t t_process_image_start_us = ggml_time_us(); - process_image(ctx_llava, embeds, params, n_past); - const int64_t t_process_image_end_us = ggml_time_us(); - float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - - llava_image_embed_free(embeds); - return ctx_llava; -} - -static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ - std::string user_prompt = prompt; - if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; - - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); - // generate the response - - LOG_TEE("\n"); - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(ctx_llava->model),params->sparams); - return ctx_sampling; -} - -static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ - - const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); - return tmp; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); - return 1; - } - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - if (params.mmproj.empty() || (params.image.empty())) { - gpt_params_print_usage(argc, argv, params); - show_additional_info(argc, argv); - return 1; - } - - for (auto & image : params.image) { - int n_past = 0; - auto ctx_llava = minicpmv_init(¶ms, image, n_past); - - if (!params.prompt.empty()) { - LOG_TEE("%s\n", params.prompt.c_str()); - LOG_TEE(""); - auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; - bool have_tmp = false; - for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); - response += tmp; - if (strcmp(tmp, "") == 0){ - if(!have_tmp)continue; - else break; - } - if (strstr(tmp, "###")) break; // Yi-VL behavior - have_tmp = true; - printf("%s", tmp); - if (strstr(response.c_str(), "")) break; // minicpm-v - - fflush(stdout); - } - llama_sampling_free(ctx_sampling); - }else { - while (true) { - LOG_TEE(""); - std::string prompt; - std::getline(std::cin, prompt); - LOG_TEE(""); - auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; - for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp);// mistral llava-1.6 - if (strstr(response.c_str(), "")) break; // minicpm-v - fflush(stdout); - } - llama_sampling_free(ctx_sampling); - } - } - printf("\n"); - llama_print_timings(ctx_llava->ctx_llama); - - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - - return 0; -} diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py deleted file mode 100644 index 12cdd128..00000000 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ /dev/null @@ -1,382 +0,0 @@ -import argparse -import os -import json -import re - -import torch -import numpy as np -from gguf import * -from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig - -TEXT = "clip.text" -VISION = "clip.vision" - - -def add_key_str(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if has_minicpmv and name in ["visual_projection.weight"]: - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - if "mm_projector" in name: - name = name.replace("model.mm_projector", "mm") - name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) - name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) - return name - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip-model-is-vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") -ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.48145466, 0.4578275, 0.40821073] -default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -# if args.clip_model_is_vision or args.clip_model_is_openclip: -# model = CLIPVisionModel.from_pretrained(dir_model) -# processor = None -# else: -# model = CLIPModel.from_pretrained(dir_model) -# processor = CLIPProcessor.from_pretrained(dir_model) - -default_vision_config = { - "hidden_size": 1152, - "image_size": 980, - "intermediate_size": 4304, - "model_type": "idefics2", - "num_attention_heads": 16, - "num_hidden_layers": 27, - "patch_size": 14, - } -vision_config = Idefics2VisionConfig(**default_vision_config) -model = Idefics2VisionTransformer(vision_config) - -processor = None -# if model.attn_pool is not None: -# model.attn_pool = torch.nn.Identity() - -# model.blocks = model.blocks[:-1] -model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) - -fname_middle = None -has_text_encoder = True -has_vision_encoder = True -has_minicpmv_projector = False -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.minicpmv_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_minicpmv_projector = True -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) -fout.add_file_type(ftype) -if args.text_only: - fout.add_description("text-only CLIP model") -elif args.vision_only and not has_minicpmv_projector: - fout.add_description("vision-only CLIP model") -elif has_minicpmv_projector: - fout.add_description("image encoder for MiniCPM-V") - # add projector type - fout.add_string("clip.projector_type", "resampler") -else: - fout.add_description("two-tower CLIP model") - -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", 448) - fout.add_uint32("clip.vision.patch_size", 14) - fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152) - fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304) - fout.add_uint32("clip.vision.projection_dim", 0) - fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16) - fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) - block_count = 26 - fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) - - if processor is not None: - image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean - image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std - else: - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -use_gelu = True -fout.add_bool("clip.use_gelu", use_gelu) - -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) - out: (M, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2. - omega = 1. / 10000 ** omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - -# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 -def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): - """ - grid_size: int of the grid height and width - return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) - """ - if isinstance(grid_size, int): - grid_h_size, grid_w_size = grid_size, grid_size - else: - grid_h_size, grid_w_size = grid_size[0], grid_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - if cls_token: - pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) - return pos_embed - -def _replace_name_resampler(s, v): - if re.match("resampler.pos_embed", s): - return { - s: v, - re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), - } - if re.match("resampler.proj", s): - return { - re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), - re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), - } - if re.match("resampler.attn.in_proj_.*", s): - return { - re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], - re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], - re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], - } - return {s: v} - -if has_minicpmv_projector: - projector = torch.load(args.minicpmv_projector) - new_state_dict = {} - for k, v in projector.items(): - kvs = _replace_name_resampler(k, v) - for nk, nv in kvs.items(): - new_state_dict[nk] = nv - projector = new_state_dict - ftype_cur = 0 - for name, data in projector.items(): - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - if ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - fout.add_tensor(name, data) - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - - print("Projector tensors added\n") - -def _replace_name(s, v): - s = "vision_model." + s - if re.match("vision_model.embeddings.position_embedding", s): - v = v.unsqueeze(0) - return {s: v} - - return {s: v} - -state_dict = model.state_dict() -new_state_dict = {} -for k, v in state_dict.items(): - kvs = _replace_name(k, v) - for nk, nv in kvs.items(): - new_state_dict[nk] = nv -state_dict = new_state_dict -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue - - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) - - -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() - -print("Done. Output file: " + fname_out) diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py deleted file mode 100644 index 2b6bce7c..00000000 --- a/examples/llava/minicpmv-surgery.py +++ /dev/null @@ -1,47 +0,0 @@ -import argparse -import os -import torch -from transformers import AutoModel, AutoTokenizer - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model") -args = ap.parse_args() - -# find the model part that includes the the multimodal projector weights -model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) -checkpoint = model.state_dict() - -# get a list of mm tensor names -mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] - -# store these tensors in a new dictionary and torch.save them -projector = {name: checkpoint[name].float() for name in mm_tensors} -torch.save(projector, f"{args.model}/minicpmv.projector") - -clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] -if len(clip_tensors) > 0: - clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} - torch.save(clip, f"{args.model}/minicpmv.clip") - - # added tokens should be removed to be able to convert Mistral models - if os.path.exists(f"{args.model}/added_tokens.json"): - with open(f"{args.model}/added_tokens.json", "w") as f: - f.write("{}\n") - -config = model.llm.config -config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5" -config.auto_map = { - "AutoConfig": "configuration_minicpm.MiniCPMConfig", - "AutoModel": "modeling_minicpm.MiniCPMModel", - "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", - "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", - "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" -} -model.llm.save_pretrained(f"{args.model}/model") -tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) -tok.save_pretrained(f"{args.model}/model") -# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py") - -print("Done!") -print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt deleted file mode 100644 index dfe5fbe6..00000000 --- a/examples/llava/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ --r ../../requirements/requirements-convert_legacy_llama.txt ---extra-index-url https://download.pytorch.org/whl/cpu -pillow~=10.2.0 -torch~=2.2.1 -torchvision==0.17.1 diff --git a/examples/server/README.md b/examples/server/README.md index a56aeecd..1ecd5ca9 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -176,7 +176,7 @@ parallel: multi-modality: - --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md + --mmproj FILE path to a multimodal projector file. see examples/mtmd/README.md --image FILE path to an image file. use with multimodal models. Specify multiple times for batching backend: diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 439db888..d46429c2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,3 @@ --r ../examples/llava/requirements.txt -r ../examples/server/bench/requirements.txt -r ../examples/server/tests/requirements.txt