ik_llama.cpp

jdelony/ik_llama.cpp

Fork 0

mirror of https://github.com/ikawrakow/ik_llama.cpp.git synced 2026-06-28 04:30:15 -05:00

Commit Graph

Select branches

Hide Pull Requests

fcp/checkpoint_min_var

fcp/context_shift_fix

fcp/fix_rpc_device

fix-recurrent-ckpt-prealloc

ik/FlashMLA-3

ik/adapt_iq1_iq2_bn

ik/adaptive_p

ik/adaptive_p_2

ik/add_extra_output_tensor

ik/add_forgotten_multi_add

ik/add_granite

ik/add_iq3ks_to_gguf

ik/add_jinja_file_help

ik/add_missing_enum_values_qwen3

ik/add_missing_gguf_constants

ik/add_missing_mmq_iq5ks

ik/add_mmq_id

ik/add_mtmd

ik/add_q60

ik/add_vq_help

ik/allow_empty_splits

ik/alternative_gemma4_assistant

ik/andrew_trellis

ik/another_mmq_id_fix

ik/apply_cuda_faster_iq3k

ik/arch_flags

ik/arm_better_r4

ik/attn_gemm

ik/avoid_cuda_mla_1

ik/avoid_per_step_ssm_copy

ik/avoid_recurrent_state_copy

ik/avx2_bf16

ik/avx2_flash_attn

ik/avx2_flash_attn_2

ik/avx2_q4_0_q8_0

ik/avx2_q5_0

ik/avx2_r4_tweaks

ik/backend_reduce_syncs

ik/bailingmoe2

ik/bailingmoe2_graph

ik/barrier

ik/bench_gp

ik/better_batched_processing

ik/better_cpu_fa_thread_strategy

ik/better_fa_glm45

ik/better_fa_masking

ik/better_fixup_stream_k

ik/better_flash_mla

ik/better_graph_pp

ik/better_graph_tg

ik/better_iq4_nl

ik/better_iqk_strategy

ik/better_model_info

ik/better_moe_small_batch

ik/better_mtp

ik/better_n_cpu_moe

ik/better_q40_kv_cache

ik/better_q40_kv_cache_cpu

ik/better_tg_fattn

ik/bf16_kv_cache

ik/bf16_r4

ik/biased_mmvq

ik/biased_qkv

ik/bitnet_adjustments

ik/bitnet_cuda

ik/bitnet_fused_unary

ik/bitnet_improve_metal

ik/bitnet_optional_scales

ik/bitnet_token_embedding_gpu

ik/bitnet_token_embedding_gpu_2

ik/bonsai_avx2

ik/bonsai_neon

ik/buffer_type_overrides

ik/bug_fixes_1926

ik/bug_missing_parentheses

ik/cached_graph

ik/change_default_fa_offset

ik/change_fmoe_fa_defaults

ik/change_q_pure

ik/chat_templates

ik/check_cpu_fa_supported_types

ik/check_for_empty_mask

ik/check_up_gate_fmoe

ik/clang_warnings

ik/cleanup_fudge_factors

ik/cohere2

ik/cohere2_moe_graph_opt

ik/cohere2_sm_graph

ik/compat_g4_assistant

ik/convert_i2s

ik/copyright

ik/correct_glm47_flash_gating_func

ik/correct_missing_gating_func_comments

ik/cpp_17

ik/cpu_argsort

ik/cpu_deepseek_fa

ik/cpu_fa_disable_mask_opt

ik/cpu_fa_dont_repack_tg

ik/cpu_fa_tg_glm4.5

ik/cpu_mla_all_quants

ik/cpu_moe_tg

ik/cpu_repeat

ik/cpu_swa_fa

ik/cpu_swa_v0

ik/cpu_swa_v1

ik/cpu_swa_v2

ik/cpu_topk_moe

ik/cuda_better_moe

ik/cuda_bf16

ik/cuda_ctx_mess

ik/cuda_faster_iq2k

ik/cuda_faster_iq4nl_kvcache

ik/cuda_faster_moe_tg

ik/cuda_fattn_Dk_Dv

ik/cuda_fix_quantized_flash_mla3

ik/cuda_flash_mla3

ik/cuda_flash_mla3_v2

ik/cuda_flash_mla_q8_0

ik/cuda_graphs_with_overrides

ik/cuda_grouped_topk

ik/cuda_iq1_m_r4

ik/cuda_iq1_s_r4

ik/cuda_iq2k_use_bperm1

ik/cuda_iq3k_use_bperm1

ik/cuda_iq4_k_r4

ik/cuda_iqk_ks_r4

ik/cuda_iqk_r4

ik/cuda_large_cpy

ik/cuda_lto

ik/cuda_mailine_fixes

ik/cuda_mla

ik/cuda_mla2

ik/cuda_mmq_iq2_k

ik/cuda_mmq_iq4_k

ik/cuda_mmq_iq4_ks

ik/cuda_native

ik/cuda_params

ik/cuda_q4_0_r4

ik/cuda_quantized_fmoe

ik/cuda_refactor_fattn

ik/cuda_rms_non_contiguous

ik/cuda_rope_back

ik/cuda_set_device

ik/cuda_swa2

ik/cuda_swa3

ik/cuda_topk_moe

ik/cuda_tracer

ik/cuda_use_bperm

ik/cuda_use_pinned_memory

ik/cuda_use_pinned_memory_2

ik/custom_q_rules

ik/debug_849

ik/debug_issue_721

ik/debug_issue_733

ik/dedup_stb_image

ik/deepseek_fa_opt

ik/deepseek_guarantee_rope_fusion

ik/deepseek_is_this_better

ik/deepseek_merge_qk

ik/deepseek_mla0

ik/deepseek_opt

ik/deepseek_rope_cache

ik/delta_dry

ik/delta_net

ik/delta_net_neon

ik/delta_net_tweaks

ik/dequant_gemm

ik/dequant_moe_gemm

ik/desperate_bug_fix_attempt

ik/dflash_fix_cpu

ik/dflash_fix_smgraph

ik/dflash_tweaks

ik/disable_add_fused_rms

ik/disable_experimental_code1

ik/disable_fusion_by_default

ik/disable_k_shift_smgraph

ik/disable_khadamard_if_not_power2

ik/disable_multi_add

ik/disable_or_enable_p2p

ik/disable_rope_cache

ik/disable_sm_row

ik/disable_smgraph_qwen35moe_mtp

ik/disable_smgraph_recurrent

ik/disable_some_fusion

ik/disable_vocab_debug

ik/disabled_cuda_graphs

ik/disallow_speculation_for_hybrid

ik/dont_abort_on_nccl_init_failure

ik/dont_split_output

ik/dup_experts_bias

ik/enable_all_iqk_fa_quants

ik/enable_cuda_graphs_with_reduce

ik/enable_fusion_by_default

ik/enable_mla3_in_crippled_ggufs

ik/enable_smgraph_mla_hybrid

ik/ernie_graph

ik/extra_reduce_types

ik/fa_512_512_turing

ik/fa_gemma4_assistant

ik/fa_mainline_compat

ik/fa_offset_2

ik/falcon3

ik/falcon3a

ik/falcon_edge

ik/fancy_simd_log

ik/fast_sampling_avx2

ik/faster_avx2_q40

ik/faster_cpu_fused_rms

ik/faster_cpu_fused_rms1

ik/faster_imatrix

ik/faster_iq2ks_quantize

ik/faster_iq3_iq5_quantize

ik/faster_iq4k

ik/faster_iq4k_quantize

ik/faster_iq4nl_quantize

ik/faster_moe_quantize

ik/faster_per_step_restore

ik/faster_q60_avx2

ik/fattn_Dk_Dv

ik/fattn_bf16

ik/fattn_enable_iq4_nl

ik/fattn_enable_q6_0

ik/fattn_fix_overflow

ik/fattn_gqa_10

ik/fattn_is_supported

ik/fattn_kq_max_offset

ik/fattn_kqv

ik/fattn_mma

ik/fattn_mma_gqa_16

ik/fattn_q35dense

ik/fattn_work_buffer

ik/fdn_fuse_silu_cpu

ik/fit_dense_model

ik/fix_1015

ik/fix_1055

ik/fix_1205

ik/fix_1237

ik/fix_1432

ik/fix_1438

ik/fix_1478

ik/fix_1961

ik/fix_300

ik/fix_358

ik/fix_412

ik/fix_447

ik/fix_499

ik/fix_538

ik/fix_596

ik/fix_827

ik/fix_Makefile

ik/fix_add_bf16_turing

ik/fix_after_883

ik/fix_again_cmake

ik/fix_annoying_warnings

ik/fix_arm_fa

ik/fix_avx2_gemm_mess

ik/fix_avx2_iq4_nl_r4

ik/fix_avx512_vs_fancy_simd

ik/fix_batched_cublas

ik/fix_bench_compile

ik/fix_bug_481

ik/fix_bug_added_in_1506

ik/fix_comma_pauses

ik/fix_compiler_warnings

ik/fix_contiguously_allocated

ik/fix_cpu_fa_bf16

ik/fix_cpu_fa_work_buffer_size

ik/fix_cuda_fa_race

ik/fix_cuda_memcpy_async

ik/fix_cuda_nans

ik/fix_cuda_scale_bug

ik/fix_debug_build

ik/fix_deepseek_fattn

ik/fix_deepseek_q80_cache

ik/fix_dequantize_when_requantizing

ik/fix_div_zero

ik/fix_dst_backend

ik/fix_dup_q

ik/fix_exp_shexp_split

ik/fix_experts_node_name

ik/fix_fa_192_128

ik/fix_fa_avx2_bug

ik/fix_fattn_odd_even

ik/fix_fattn_supported

ik/fix_flash_attn

ik/fix_fused_grouped_topk

ik/fix_gcc_arm

ik/fix_gemma3_vision

ik/fix_gemma4_hybrid

ik/fix_gemma4_mtp

ik/fix_gemma4_quantized_KV_cache_cuda

ik/fix_gemma4_quantized_kv_cache_cpu

ik/fix_gemma_e4b

ik/fix_ggml_common

ik/fix_ggml_nbytes

ik/fix_glm4_attn

ik/fix_glm_mtp

ik/fix_glm_mtp_accept

ik/fix_glm_mtp_smgraph

ik/fix_gpt_oss_partial_offload

ik/fix_graph_parallel_partial_offload

ik/fix_grt_bf16

ik/fix_hadamard_bug

ik/fix_hybrid_detection

ik/fix_hybrid_graph_muge

ik/fix_imatrix_check

ik/fix_imatrix_nonsense

ik/fix_iq4k_avx2

ik/fix_iqk_for_strange_numrows

ik/fix_jinja

ik/fix_kimi2_parse

ik/fix_kld

ik/fix_kq

ik/fix_llama4_attention

ik/fix_llama_kv_cache_cell_max

ik/fix_metal_fa

ik/fix_minimax_hadamard

ik/fix_misleading_quantize_error

ik/fix_missing_bf16_avx512

ik/fix_missing_dry

ik/fix_missing_end

ik/fix_mistral3_smgraph

ik/fix_mla1

ik/fix_mla_imatrix

ik/fix_mla_smgraph_cache_load_save

ik/fix_mmproj_bf16_cpu

ik/fix_mmq_id

ik/fix_mmq_overflow

ik/fix_mmvq_bug

ik/fix_mtp_discarding

ik/fix_mtp_no_gr

ik/fix_mtp_plus_muge

ik/fix_mul_mat_16

ik/fix_multiple_choice

ik/fix_neon_build

ik/fix_neon_legacy_quants

ik/fix_neon_q82

ik/fix_no_iqk_build

ik/fix_no_p2p_case

ik/fix_partial_ngl_smgraph

ik/fix_partial_ngl_smgraph_mla

ik/fix_partial_offload_crash

ik/fix_perf_regression

ik/fix_pr_261

ik/fix_pr_842

ik/fix_q35moe_mtp_smgraph

ik/fix_q41_q51_arm

ik/fix_q5_0_fa

ik/fix_q6_0_dequantize

ik/fix_q80_avx2_2

ik/fix_q80_avx2_mess

ik/fix_q80_moe_avx2

ik/fix_quantize_kt

ik/fix_quantized_k_cache

ik/fix_quantized_kv_nofa

ik/fix_qwen35_smgraph_hybrid

ik/fix_qwen35moe_low_mtp_acceptance

ik/fix_qwen_mtp_warmup

ik/fix_reduce_race

ik/fix_reduce_windows

ik/fix_repacked_legacy_quants

ik/fix_replace_all

ik/fix_requantize_interleaved

ik/fix_requantize_interleaved_2

ik/fix_ring_reduction

ik/fix_rope_norm_fast_cuda

ik/fix_rpc_off

ik/fix_rpc_off2

ik/fix_rtr_mqkv

ik/fix_ser

ik/fix_ser_cuda

ik/fix_sm_graph_with_vision

ik/fix_standard_attention_cpu

ik/fix_sync_logic

ik/fix_the_fix

ik/fix_typo

ik/fix_unknown_tensor_type

ik/fix_up_gate_mmq_not_supported

ik/fix_vulkan_required

ik/fix_windows

ik/fix_windows_avx512

ik/fix_windows_no_omp

ik/fix_xeon_6226R

ik/flash_mla

ik/flash_mla2_cuda_no_f32

ik/flash_mla2_no_f32

ik/flash_mla_2

ik/flash_mla_4

ik/flash_precision

ik/flax-vector-conversions

ik/format_name

ik/fuse_add_add_fused_rms

ik/fuse_add_fused_rms

ik/fuse_bias_only_tg

ik/fuse_biased_qkv

ik/fuse_kvcache_copy

ik/fuse_merge_up_gate_exps

ik/fuse_moe_up_gate

ik/fuse_mul_mat_scale

ik/fuse_qkv

ik/fuse_rms_rms_add

ik/fuse_ssm_silu_neon

ik/fused_bailingmoev2

ik/fused_delta_net

ik/fused_delta_net_2

ik/fused_delta_net_3

ik/fused_delta_net_3a

ik/fused_delta_net_avx512

ik/fused_ffn_up_gate

ik/fused_mul_multiadd

ik/fused_mul_unary

ik/fused_mul_unary_1

ik/fused_norm

ik/fused_rms_norm

ik/fused_rms_rms

ik/fused_rope_rope

ik/fused_softcap_softmax

ik/fused_up_gate_unary

ik/g4_assistant_smgraph

ik/gemm_4d

ik/gemm_iq1s

ik/gemm_neon_1bit

ik/gemm_neon_iqk

ik/gemm_neon_iquants

ik/gemm_neon_kquants

ik/gemm_neon_legacy

ik/gemma3

ik/gemma3_mqkv_rcache

ik/gemma4

ik/gemma4_12B_smgraph

ik/gemma4_e2b_tweaks

ik/gemma4_fuse_logits

ik/gemma4_gp_bugfix

ik/gemma4_mtmd_blindness

ik/gemma4_mtp_avoid_f32_cast

ik/gemma4_mtp_extra_output

ik/gemma4_mtp_graph_reuse

ik/gemma4_mtp_last_device

ik/gemma4_routing

ik/gemma4_tokenizer_fixes

ik/gemma4_vision

ik/gemma_output_tensor

ik/gemma_q80_kvcache

ik/gemv_bf16_r16

ik/gguf_bool_arrays

ik/gguf_py_add_maxfp4

ik/gguf_py_changes_for_np2.0

ik/glm45_tg_fa_hack

ik/glm45_tg_very_fast

ik/glm47_fa_2

ik/glm47_tg_fa_hack

ik/glm5

ik/glm5_mtp

ik/glm_flash

ik/glm_mtp_warmup

ik/gpt-oss

ik/gpt_oss_graph

ik/gpu_layers

ik/gpu_layers_2

ik/gpu_layers_3

ik/graph_alloc

ik/graph_better_splits

ik/graph_parallel_tweak

ik/graph_reuse

ik/graph_reuse_field

ik/graph_reuse_on

ik/hadamard_512

ik/hadamard_block_size

ik/handle_incompatible_deepseek_ggufs

ik/handle_split_cache

ik/handle_think_no_space

ik/hide_imatrix

ik/honor_manual_splits

ik/hsums

ik/huihui_57B

ik/hunyuan_graph

ik/ignore_nextn

ik/ignore_nextn_layers

ik/imatrix_ffn_gate

ik/imatrix_fused_up_gate

ik/imatrix_lsim

ik/improve_iq1m

ik/improve_iq2_xs

ik/improve_iq2ks

ik/improve_mmq

ik/interleaved_guards

ik/iq1_kt

ik/iq1_m_neon

ik/iq1_m_r4

ik/iq1_s_checks

ik/iq1_s_gemm

ik/iq1_s_r4

ik/iq1_s_r4_k128

ik/iq1_s_r4_neon

ik/iq1_tn

ik/iq1_tn_cuda

ik/iq1_tn_metal

ik/iq1bn_metal

ik/iq1m_gemm

ik/iq2_bn_r4

ik/iq2_k

ik/iq2_k_r4

ik/iq2_k_tweak

ik/iq2_kl

ik/iq2_s_r4

ik/iq2_tn

ik/iq2_tn_as_iq2_bn

ik/iq2_tn_avx2

ik/iq2_tn_faster_pp

ik/iq2_xs_r4

ik/iq2_xxs_gemm

ik/iq2_xxs_r4

ik/iq2k_experiments

ik/iq2ks_experiments

ik/iq3_k_r4_v2

ik/iq3_ks

ik/iq3_ks_v2

ik/iq3_s_gemm

ik/iq3_s_r4

ik/iq3_s_r4_v2

ik/iq3_xxs_gemm

ik/iq3_xxs_r4

ik/iq3_xxs_r4_v2

ik/iq4_k

ik/iq4_k_r4

ik/iq4_k_r4_avx2

ik/iq4_k_tweaks

ik/iq4_k_xxs

ik/iq4_knn

ik/iq4_ks_r4

ik/iq4_kss

ik/iq4_kss_improvements

ik/iq4_nl_cache

ik/iq4_nl_x4

ik/iq4_xs_r4

ik/iq4_xs_r4_avx2

ik/iq4_xs_r8

ik/iq4_xs_r8_v2

ik/iq4kss_experiments

ik/iq4nl_kv_cache

ik/iq5_k_r4

ik/iq5_ks

ik/iq5_ks_r4

ik/iq6_k

ik/iq_gemv_tweaks

ik/iqk_fattn_all_quants

ik/iqk_gemm

ik/iqk_mmvq_opt

ik/iqk_q_improvements

ik/is_this_better_for_multi_gpu

ik/issue_214

ik/issue_217

ik/issue_224

ik/issue_230

ik/k_cache_hadamard

ik/k_cache_hadamard_cuda

ik/keep_mmap_with_no_pinned

ik/kq_fused_softmax

ik/kq_mask

ik/kq_mask_padding_64

ik/l4_rms_norm

ik/laguna_sm_graph

ik/legacy_gemm

ik/limit_amb

ik/llama4

ik/llama_bench_fit

ik/llama_bench_mla3

ik/llama_bench_n_cpu_moe

ik/llama_bench_overrides

ik/llama_bench_rcache

ik/llama_bench_sas

ik/llama_bench_sm_arg

ik/llama_bench_tgb

ik/llama_hparams_add_mla

ik/llama_warnings

ik/log_probs_on_crash

ik/logging_cleanup

ik/make_biased_gemv_optional

ik/make_qx_quants

ik/mask_mt

ik/max_nodes

ik/max_nodes_again

ik/measure_barriers

ik/mellum_sm_graph

ik/merge_Aug_12_2024

ik/merge_July_26_2024

ik/merge_only_qk

ik/merge_qkv

ik/merge_up_gate_exps_2

ik/merge_up_gate_exps_3

ik/metal_bf16

ik/metal_faster_iq4ks

ik/metal_fattn_update

ik/metal_fix_iq2k

ik/metal_fix_iq3k

ik/metal_moe

ik/metal_new_trellis

ik/mimo2

ik/mimo2.5

ik/mimo2_4_gpus

ik/mimo2_graph

ik/minimax2_very_fast

ik/minimax_graph_minor

ik/minimaxm3_smgraph

ik/ministral3

ik/minmax2_sm_graph

ik/minor_delta_tweak

ik/minor_iq2ks_tweak

ik/minor_mtp1

ik/minor_silu

ik/mistral3_large

ik/mistral3_std_attn

ik/mistral4

ik/mistral4_cpu_fa

ik/mixd_kv_cache

ik/mla

ik/mla2_q80_cache

ik/mla2_q80_cache_cpu

ik/mla=3_by_default

ik/mla_add_extra_nodes

ik/mla_fixes

ik/mla_guard

ik/mla_imatrix

ik/mla_no_transposed_cache

ik/mla_q80

ik/mla_smgraph

ik/mmq_id_thresh

ik/mmq_iq_ks_r4

ik/mmq_show_error_details

ik/mmq_to_cublas

ik/mmvq_args

ik/mmvq_fuse_bias

ik/mmvq_type_supported

ik/model_fit

ik/moe_fused_unary

ik/moe_offload_strategy

ik/more_set_device

ik/mtmd_kq_type

ik/mtmd_reduce_memory_use

ik/mtp_accept_only_last_logits

ik/mtp_async_copies

ik/mtp_per_step_smgraph

ik/mtp_requantize_output

ik/mtp_reuse_graphs

ik/mtp_reuse_graphs_2

ik/mtp_tweaks1

ik/mtp_tweaks_2

ik/mul_mat_bf16

ik/mul_mat_ext

ik/multi_add

ik/mv_q4_0_r4

ik/mxfp4

ik/n_cpu_moe

ik/nccl1

ik/nccl2

ik/nccl3

ik/nccl3_async

ik/neon_bf16

ik/neon_flash_attention_2

ik/neon_flash_attention_3

ik/neon_improve_legacy_quants

ik/neon_iq3_kt

ik/new_iq1bn

ik/new_iq2kt

ik/new_iq2kt_v2

ik/new_iq4kt

ik/new_trellis_2

ik/no_KV_for_unused_layers

ik/non_contiguous_rope

ik/offline_repack

ik/offline_repack_patterns

ik/offload_policy

ik/ooae2

ik/ooae_on_by_default

ik/opt_kt_quants

ik/option_cpu_fa

ik/option_to_disable_cuda_fusion

ik/optional_yarn_log_multiplier

ik/ot_ffn_gate_up

ik/p2p_cpy_set_device

ik/per_gpu_fit_margin

ik/per_row_scale

ik/per_step_conv_states

ik/phi3.5_tweaks

ik/pickup_13095

ik/pinned_suggest

ik/play_with_barrier

ik/poc_tp

ik/poc_tp_glm4.5

ik/pre_merged_up_gate

ik/prepare_wk_b

ik/purge_blas

ik/q2_k_r4

ik/q35_tweaks

ik/q3_k_r4

ik/q3next_concat

ik/q3next_concat_cpu

ik/q3next_cuda_graphs

ik/q3next_opt2

ik/q3next_opt3

ik/q4_0_r4

ik/q4_0_r8

ik/q4_k_gemm

ik/q4_k_r4

ik/q4_k_r4_v2

ik/q4_k_r4_v3

ik/q5_0_r4

ik/q5_k_r4

ik/q60_mmq

ik/q6_0_r4

ik/q6_k_gemm

ik/q6_k_r4

ik/q8_0_r4

ik/q8_KV

ik/q8_k_r16

ik/q8_k_r8

ik/q8_k_r8_avx512

ik/qkvz_tweak

ik/qkvz_tweak1

ik/qmix_tweaks

ik/qmix_tweaks_2

ik/qstats

ik/quantization_tweaks

ik/quantize_dry_run

ik/quantize_ffn_gate_inp

ik/quantize_fused_up_gate

ik/quantize_gemma4

ik/quantize_mmproj

ik/quantize_options

ik/quantize_q8k_avx2

ik/quantize_stats

ik/qwen3.5_vision

ik/qwen35_model_types

ik/qwen35_mtp_smgraph

ik/qwen35_std_attn

ik/qwen35dense

ik/qwen35moe

ik/qwen35moe_muge

ik/qwen3_graph

ik/qwen3next

ik/qwen3vl_graph

ik/qwen_mtp_inp_out_ids

ik/qx_0_r4_avx2

ik/qx_k_b32_avx2

ik/r4_faster_zen4

ik/r4_neon

ik/r4_nrcy_16

ik/really_fix_rope_cache

ik/reduce_compute_buffers

ik/reduce_make_copies

ik/reduce_mla3_compute_buffer_size

ik/reduce_no_nccl

ik/reduce_race_quick_fix

ik/refactor_graphs

ik/refactor_iqk

ik/refactor_llama.cpp

ik/remove_iqk_option

ik/remove_kv_l

ik/remove_llamafile

ik/remove_scary_warning

ik/remove_unnecessary_calls

ik/remove_unnessessary_ids_copy

ik/rename_4_8

ik/rename_iq4_nl_x4

ik/reorg_mmvq_and_fuse_bias

ik/repack_also_experts

ik/repack_f16

ik/reset_1st_recurrent_graph

ik/revert_0bf4d997

ik/revert_1496

ik/revert_1687

ik/revert_739

ik/revert_delta_net_3

ik/revert_dflash_swa_opt

ik/reverts

ik/ring_reduce

ik/rm_Makefile

ik/rms_block_size

ik/rng_sampling

ik/rope_cache

ik/rtr_plus_muge

ik/run_time_repack

ik/sampling-top-n-sigma

ik/sampling-xtc

ik/sampling_refactor_sorting

ik/sampling_top_n_sigma

ik/sanitize_importance_iqk

ik/sanitize_importance_kt_quants

ik/sched_copy_experts

ik/sched_max_copies=1

ik/server_send_done

ik/set_draft_input_hidden_state

ik/shexps_better_hybrid

ik/simplify_delta_net

ik/simplify_delta_net_2

ik/skip_get_rows

ik/skip_noop_barriers

ik/skip_rowids_computation

ik/skip_unnecessary_quantize

ik/slightly_better_fdn

ik/slightly_better_graph_split_strategy

ik/sm_graph_cuda_graphs

ik/sm_graph_delta_net

ik/sm_graph_disable_cuda_graphs

ik/sm_graph_gemma4_moe

ik/sm_graph_max_gpu

ik/sm_graph_muge

ik/sm_graph_partial_offload

ik/sm_graph_pre_merged_up_gate

ik/sm_graph_q35

ik/sm_graph_q3next

ik/sm_graph_qwen35moe

ik/sm_graph_rearrange

ik/sm_graph_seedoss

ik/sm_graph_step35

ik/sm_graph_sync

ik/smart_expert_selection

ik/smollm3

ik/softcap

ik/softcap_minor

ik/split_graph_2

ik/split_mode_f32

ik/ssm_conv4_avx2

ik/ssm_conv4_silu

ik/standardize_gemma4

ik/step35

ik/step35_compat

ik/support_gigachat

ik/sweep_bench_n_predict

ik/sweep_bench_nrep

ik/sweep_bench_warmup

ik/swiglu

ik/sync_fa

ik/tensor_names

ik/tensor_override_honor_mmap

ik/test_q80_NaNs

ik/test_thp

ik/tg_tweaks

ik/topk_moe_fuse_bias

ik/topk_moe_with_norm

ik/trellis_bf16

ik/trellis_metal

ik/trellis_neon

ik/trellis_opt

ik/trinet

ik/try_authors

ik/try_cuda_graphs

ik/try_fa_no_q80_repack

ik/try_fix_1014

ik/try_fix_1201

ik/try_fix_1222

ik/try_fix_367

ik/try_fix_367_v2

ik/try_fix_690

ik/try_fix_772

ik/try_fix_854

ik/try_fix_974

ik/try_fix_avx2_fa

ik/try_fix_many_gpus

ik/try_fix_many_gpus_2

ik/try_grouped_topk_playing1

ik/try_minimax_better_sm_graph

ik/try_remove_cpy_indirection

ik/try_split_mla

ik/try_split_offloaded_moe_up_gate

ik/try_svd

ik/try_trellis

ik/undo_1049_if_tensor_overrides

ik/undo_1421

ik/undo_sync_reduction

ik/update_authors

ik/update_license

ik/use_bf16_when_no_mmq

ik/use_mmq_id_for_moe

ik/use_q8_2

ik/v_cache_hadamard

ik/validate_quants_on_load

ik/vendor

ik/vulkan1

ik/vulkan_again

ik/vulkan_disable_fused_ops

ik/vulkan_disable_multi_add

ik/vulkan_fattn

ik/vulkan_fused_mul_unary

ik/vulkan_fused_rms

ik/vulkan_multi_add

ik/warn_pinned_alloc

ik/wip_sync_llama

ik/worst_graph_tokens

ik/zen4_faster_iq4ks_iq5ks

ik/zen4_flash_attn

ik/zen4_flash_attn_2

ik/zen4_flash_attn_bf16

ik/zen4_iq4_xs_r4

ik/zen4_repack_f16

ikawrakow-patch-1

ikawrakow-patch-1-1

ikawrakow-patch-2

main

revert-1696-fix/recurrent-state-reset

s6/MLA_prompt_save_restore_fix

s6/bitnet2b_2501

s6/bitnet_name_update

s6/cache_default

s6/deci_support

s6/docs_update

s6/dots

s6/fix_kshift_crash

s6/fix_prompt_tokenization

s6/fix_python

s6/fp8_native

s6/imatrix_conv

s6/list_prompt_cache

s6/mikupad

s6/mla

s6/numa_KV

s6/qwen3_dynamic_yarn

s6/readme-minor1

s6/readme-minor2

s6/readme_update

s6/remove_kv_l

s6/rope_freq_fix

s6/rpc

s6/seed_support2

s6/sweep_bench

s6/sweep_bench_update

s6/termux_fix

s6/warmup

#1

#10

#1000

#1001

#1003

#1004

#1005

#1006

#1007

#1008

#101

#1011

#1012

#1016

#1017

#1018

#102

#1022

#1023

#1024

#1025

#1026

#1027

#1029

#1030

#1031

#1032

#1033

#1034

#1035

#1036

#1037

#1038

#1039

#1040

#1042

#1047

#1048

#1049

#105

#1050

#1051

#1052

#1053

#1054

#1056

#1057

#1058

#1059

#106

#1060

#1061

#1062

#1063

#1064

#1065

#1067

#1068

#1069

#107

#1070

#1071

#1073

#1079

#108

#1080

#1082

#1086

#1087

#1088

#1089

#109

#1091

#1092

#1093

#1094

#1096

#1097

#11

#110

#1100

#1101

#1103

#1104

#1105

#1106

#1107

#111

#1110

#1112

#1114

#1115

#1116

#1118

#1119

#112

#1120

#1121

#1124

#1126

#1128

#1129

#113

#1130

#1131

#1131

#1134

#1135

#1136

#1137

#1138

#1139

#114

#1140

#1141

#1143

#1144

#1147

#115

#1151

#1152

#1153

#1154

#1155

#1156

#116

#1160

#1161

#1164

#1165

#1166

#1168

#117

#1170

#1171

#1172

#1174

#1175

#1176

#1177

#1178

#1179

#118

#1182

#1183

#1184

#1185

#1187

#119

#1190

#1191

#1192

#1193

#1194

#1195

#1196

#1198

#1199

#12

#120

#1202

#1206

#1207

#1208

#121

#1211

#1212

#1213

#1214

#1215

#1216

#1217

#1218

#122

#1220

#1221

#1222

#1223

#1224

#1226

#123

#1231

#1235

#1236

#1238

#1239

#124

#1240

#1241

#1243

#1244

#1249

#125

#1250

#1251

#1252

#1257

#126

#1260

#1261

#1262

#1263

#1266

#1268

#1269

#127

#1270

#1272

#1274

#1275

#1276

#1277

#1278

#1279

#128

#1280

#1283

#1284

#1285

#1286

#1287

#1288

#129

#1292

#1295

#1296

#13

#130

#1300

#1301

#1303

#1304

#1305

#1306

#1307

#1308

#1309

#131

#1310

#1311

#1313

#1314

#1315

#1318

#132

#1320

#1321

#1322

#1326

#1328

#1329

#1330

#1331

#1332

#1333

#1335

#1336

#1337

#1339

#134

#1340

#1345

#1346

#1347

#1349

#135

#1350

#1352

#1354

#1355

#1359

#136

#1361

#1362

#1365

#1366

#1367

#1368

#1369

#137

#1371

#1372

#1373

#1374

#1375

#1376

#1377

#1378

#138

#1386

#1388

#139

#1392

#1393

#1397

#1398

#14

#1400

#1402

#1403

#1404

#1405

#1407

#1408

#141

#1410

#1412

#1413

#1417

#1418

#1419

#142

#1421

#1422

#1423

#1424

#1425

#1426

#1427

#1429

#143

#1430

#1433

#1435

#1436

#1437

#1439

#144

#1440

#1441

#1443

#1444

#1446

#1447

#145

#1450

#1451

#1452

#1454

#1455

#1456

#1458

#1459

#146

#1460

#1462

#1463

#1464

#1466

#1467

#1468

#1469

#147

#1472

#1474

#1475

#1476

#1477

#1479

#148

#1482

#1483

#1485

#149

#1490

#1491

#1492

#1493

#1494

#1496

#1497

#1498

#1499

#150

#1501

#1503

#1504

#1505

#1506

#1508

#151

#1510

#1511

#1512

#1513

#1515

#1516

#1517

#1518

#1519

#152

#1521

#1526

#1527

#153

#1530

#1531

#1535

#1539

#154

#1540

#1542

#1543

#1546

#1547

#1548

#1549

#155

#1550

#1553

#1556

#1558

#1558

#156

#1560

#1561

#1562

#1564

#1565

#1567

#157

#1570

#1571

#1573

#1574

#1577

#1578

#1579

#158

#1581

#1582

#1583

#1584

#1585

#1590

#1592

#1593

#1593

#1595

#1596

#1597

#1598

#1599

#16

#1600

#1601

#1603

#1604

#1606

#1609

#161

#1610

#1615

#1617

#1617

#162

#1625

#1626

#1627

#163

#1633

#1634

#1635

#1637

#1638

#1641

#1644

#1645

#1646

#1647

#1648

#1649

#1651

#1652

#1653

#1654

#1654

#1657

#1659

#1666

#1669

#1672

#1673

#1677

#1679

#168

#1682

#1683

#1686

#1687

#1688

#1689

#169

#1690

#1691

#1696

#1698

#17

#170

#1700

#1701

#1702

#1703

#1704

#1707

#171

#1710

#1713

#1714

#1716

#1717

#1718

#172

#1721

#1722

#1723

#1724

#1726

#1727

#1727

#1728

#1729

#173

#1731

#1732

#1733

#1734

#1735

#1736

#1738

#1738

#174

#1741

#1743

#1744

#1745

#1746

#1746

#1748

#175

#1750

#1753

#1755

#1756

#1757

#1758

#1759

#176

#1760

#1761

#1764

#1764

#1767

#177

#1770

#1771

#1773

#1774

#1776

#1777

#1778

#178

#1780

#1781

#1782

#1783

#1784

#1785

#1786

#1787

#1788

#1789

#179

#1791

#1792

#1794

#1795

#1796

#1797

#1798

#1799

#180

#1800

#1801

#1803

#1804

#1805

#1806

#1808

#1809

#181

#1810

#1813

#1815

#1816

#1817

#1819

#182

#1820

#1821

#1822

#1825

#1826

#1827

#1828

#1830

#1830

#1832

#1834

#1835

#1838

#184

#1840

#1841

#1844

#1846

#1847

#1848

#1849

#1849

#185

#1851

#1852

#1853

#1853

#1854

#1855

#1857

#1858

#186

#1860

#1861

#1862

#1866

#1867

#1869

#187

#1870

#1871

#1872

#1873

#1876

#1877

#1877

#1879

#188

#1880

#1881

#1883

#1884

#1885

#1886

#1887

#1888

#1888

#1889

#189

#1890

#1892

#1892

#1893

#1894

#1895

#1897

#1899

#19

#190

#1901

#1903

#1904

#1906

#1907

#1908

#191

#1911

#1912

#1913

#1914

#1914

#1918

#1919

#192

#1920

#1921

#1922

#1923

#1924

#1925

#1927

#193

#1930

#1933

#1933

#1934

#1935

#1937

#1938

#1939

#194

#1940

#1941

#1942

#1943

#1944

#1945

#1947

#1948

#1949

#195

#1951

#1954

#1955

#1956

#1958

#1960

#1962

#1963

#1964

#1965

#1967

#1968

#1969

#197

#1970

#1972

#1973

#1974

#1976

#1976

#1977

#1979

#198

#1980

#1983

#1985

#1987

#1988

#1989

#1990

#1991

#1992

#1993

#1994

#1995

#1996

#1997

#1998

#1999

#2

#20

#200

#2000

#2001

#2003

#2005

#2007

#2008

#2009

#2010

#2011

#2011

#2012

#2014

#2015

#2016

#2017

#2018

#2019

#202

#2020

#2021

#2022

#2023

#2025

#2026

#2027

#2029

#2031

#2034

#2035

#2036

#2037

#2038

#2039

#204

#2042

#2042

#2044

#2044

#2045

#2045

#2047

#2047

#2048

#2048

#205

#206

#207

#208

#21

#210

#212

#213

#215

#216

#218

#219

#22

#220

#225

#226

#229

#23

#231

#232

#233

#234

#235

#236

#237

#238

#239

#24

#240

#241

#243

#244

#246

#247

#248

#250

#251

#252

#253

#259

#260

#261

#262

#264

#265

#268

#269

#27

#270

#272

#273

#274

#275

#276

#277

#278

#279

#28

#280

#282

#283

#284

#287

#289

#290

#291

#292

#294

#295

#298

#299

#3

#301

#302

#303

#304

#307

#309

#31

#310

#311

#312

#313

#315

#317

#318

#32

#320

#321

#324

#325

#326

#327

#328

#329

#33

#330

#331

#332

#333

#336

#337

#338

#341

#342

#343

#344

#346

#347

#348

#349

#35

#351

#352

#355

#356

#36

#360

#364

#366

#368

#369

#37

#370

#371

#374

#375

#377

#38

#382

#386

#39

#390

#391

#392

#394

#4

#40

#400

#402

#404

#405

#406

#408

#409

#41

#410

#411

#413

#414

#415

#416

#417

#418

#42

#421

#422

#424

#426

#427

#428

#429

#43

#430

#431

#435

#438

#439

#44

#441

#442

#443

#444

#445

#446

#448

#449

#45

#453

#454

#457

#458

#46

#460

#461

#462

#465

#468

#469

#47

#470

#471

#473

#475

#478

#48

#480

#481

#482

#483

#484

#486

#487

#488

#489

#49

#492

#493

#494

#495

#496

#497

#5

#50

#501

#502

#504

#505

#506

#508

#509

#51

#510

#511

#512

#513

#515

#516

#517

#518

#52

#520

#524

#525

#528

#529

#53

#531

#533

#534

#535

#536

#537

#54

#540

#541

#542

#544

#546

#547

#549

#55

#550

#552

#553

#554

#554

#555

#557

#558

#559

#56

#560

#563

#565

#566

#567

#569

#57

#570

#571

#573

#574

#577

#578

#579

#58

#580

#581

#582

#583

#584

#585

#587

#588

#589

#592

#593

#595

#598

#6

#602

#603

#604

#606

#607

#608

#609

#61

#610

#611

#612

#616

#617

#618

#62

#620

#622

#624

#628

#630

#631

#637

#639

#64

#640

#642

#643

#645

#648

#65

#652

#653

#653

#654

#66

#661

#662

#668

#670

#672

#674

#676

#677

#68

#680

#682

#683

#684

#688

#689

#69

#692

#695

#696

#698

#699

#7

#70

#700

#701

#702

#705

#707

#708

#709

#71

#710

#711

#712

#713

#714

#716

#717

#719

#72

#720

#722

#723

#724

#726

#727

#728

#73

#734

#735

#738

#739

#74

#740

#741

#742

#745

#748

#75

#751

#752

#754

#757

#759

#76

#760

#762

#764

#768

#77

#771

#774

#78

#782

#786

#787

#788

#789

#79

#790

#791

#794

#795

#796

#797

#798

#799

#80

#801

#802

#803

#807

#81

#810

#814

#817

#820

#823

#824

#825

#826

#828

#829

#83

#833

#835

#836

#837

#838

#84

#840

#841

#842

#843

#844

#845

#85

#850

#851

#852

#853

#855

#857

#858

#86

#860

#861

#863

#864

#866

#868

#87

#870

#871

#872

#874

#875

#876

#878

#879

#880

#881

#882

#883

#887

#889

#89

#891

#892

#894

#896

#897

#899

#9

#90

#900

#901

#902

#903

#906

#907

#91

#910

#911

#913

#914

#916

#920

#921

#922

#923

#924

#926

#928

#929

#93

#931

#932

#933

#934

#935

#936

#937

#938

#939

#94

#941

#943

#944

#945

#947

#948

#949

#951

#952

#954

#957

#958

#959

#96

#963

#965

#966

#968

#969

#97

#970

#971

#972

#973

#976

#977

#98

#980

#983

#984

#985

#987

#988

#989

#99

#991

#992

#993

#995

#996

#998

#999

t0002

f96eaddba8

Revert DFlash SWA optimization (#2039) main Kawrakow 2026-06-26 11:00:09 +02:00
0440345ba9 Revert DFlash SWA optimization ik/revert_dflash_swa_opt Kawrakow 2026-06-26 08:58:50 +00:00
1255b1e479

Minor DFlash tweaks (#2034) Kawrakow 2026-06-26 10:31:03 +02:00
af62a37acd

Prune examples/llava. Dead code. (#2025) Farmadupe 2026-06-26 07:48:48 +01:00
c713bd599b

llama : fix CPU-only load crash on a CUDA build (device_mem out-of-bounds) (#2037) mb8565 2026-06-26 01:47:19 -05:00
0ffdf509ab

ggml : fix set_rows CPU crash when the destination is F32 (#2038) mb8565 2026-06-26 01:46:26 -05:00
a4e408611d Minor DFlash tweaks ik/dflash_tweaks Kawrakow 2026-06-25 15:10:16 +00:00
b84902d2ad

Split mode graph for dense Qwen35 MTP (#2027) Kawrakow 2026-06-25 11:12:22 +02:00
d3e86a5431

Free raw multimedia data from server_tokens after encoding, as it will never be read again (#2029) Farmadupe 2026-06-25 09:18:32 +01:00
bdf5c081dc

DFlash: enable sliding-window attention for draft models (#2021) Joel Farthing 2026-06-25 02:06:54 -05:00
4553cd0059

cuda : fix MLA flash-attn vec decode for asymmetric K/V head sizes (#2031) mb8565 2026-06-25 01:56:17 -05:00
e1670f6c6c Merge remote-tracking branch 'origin/main' into ik/qwen35_mtp_smgraph ik/qwen35_mtp_smgraph Kawrakow 2026-06-24 16:32:10 +00:00
d5507e33ae

Split mode graph for dense Gemma4 assistant (#2022) Kawrakow 2026-06-24 18:29:32 +02:00
9acd6a4cb2 Split mode graph for dense Qwen35 MTP Kawrakow 2026-06-24 16:21:17 +00:00
1f5828eaa4 It is better to use llama_context pointers as keys ik/g4_assistant_smgraph Kawrakow 2026-06-24 13:53:59 +00:00
de6c2dfdec Compiler warnings Kawrakow 2026-06-24 09:34:58 +00:00
bf23a7599c

Avoid Gemma4 assistant strange tensor name warnings (#2023) Kawrakow 2026-06-24 11:23:22 +02:00
9283af5ed8 Avoid Gemma4 assistant strange tensor name warnings ik/tensor_names Kawrakow 2026-06-24 09:20:41 +00:00
118c82d8ef This works Kawrakow 2026-06-24 08:35:06 +00:00
75a5f6d079 Per model CUDA contexts Kawrakow 2026-06-23 16:03:28 +00:00
3530b65869 WIP: Split mode graph for Gemma4 assistant Kawrakow 2026-06-23 13:34:51 +00:00
7cacf28eec

Fix minor GGML discrepencies (#2016) Nexes the Elder 2026-06-24 09:09:33 +02:00
8686ea708b

chat: Cohere2MoE/North Code: parse unopened thinking under --reasoning off (follow-up to #1968) (#2012) Joel Farthing 2026-06-24 02:04:41 -05:00
5a4fa17947

Load glm-dsa indexer tensors as optional (ggml-org/llama.cpp#24770) (#2017) Yap Sok Ann 2026-06-24 14:04:09 +07:00
997b289d93

jinja: give each for-loop iteration a fresh scope (#2018) Yap Sok Ann 2026-06-24 13:58:36 +07:00
a7d35d51dc

eval-callback : sum over the full tensor, not just the printed slice (#2019) mb8565 2026-06-24 01:57:19 -05:00
befbc0945b

server: variance based checkpoint eviction (#2020) firecoperana 2026-06-24 01:54:07 -05:00
3476dd6a40 server: variance based checkpoint eviction fcp/checkpoint_min_var firecoperana 2026-06-23 20:22:06 -05:00
7ccf1d2095

allow user to use THP for host allocations with GGML_CUDA_HOST_MALLOC_THP (#2010) Farmadupe 2026-06-23 14:13:41 +01:00
2d3ecd5e19

Fix minor CUDA discrepancies (part 2) (#2015) Nexes the Elder 2026-06-23 14:03:22 +02:00
9eaf86a7c7

Fix minor CUDA discrepencies (#2005) Nexes the Elder 2026-06-23 09:37:48 +02:00
69a8336d08

Add native MiniMax-M3 tool call parser (#2008) Jun Yamog 2026-06-23 19:36:02 +12:00
b2b4f66fa0

tests: add Seed-OSS chat template fixture (#2014) Joel Farthing 2026-06-23 02:35:28 -05:00
b47b90d0be

Add Laguna M.1 GGUF support (#2003) empty-quiver 2026-06-22 10:53:10 -04:00
64fceb70bc

DFlash: use persistent FA-ready K/V cache (#1997) Joel Farthing 2026-06-22 09:49:35 -05:00
72440a19fc

on-demand tensor reload (#1989) magikRUKKOLA 2026-06-22 14:36:34 +00:00
6c00e87ac8

cmake: drop ggml-blas.h from GGML_PUBLIC_HEADERS (#2007) a1batross 2026-06-21 10:49:09 +05:00
d47f484d29

Force Gemma4 assistant to be loaded on last GPU (#1999) Kawrakow 2026-06-19 18:17:13 +02:00
8369cf7412

Allow graph reuse for Gemma4 MTP (#1996) Kawrakow 2026-06-19 18:16:53 +02:00
b21653a56f

Fully remove any BLAS remnants (#2001) Kawrakow 2026-06-19 17:26:09 +02:00
3cf0f5468f Also these ik/purge_blas Kawrakow 2026-06-19 15:24:24 +00:00
d30b35cb97 Fully remove any BLAS remnants Kawrakow 2026-06-19 15:14:27 +00:00
e734b76632 Force Gemma4 assistant to be loaded on last GPU ik/gemma4_mtp_last_device Kawrakow 2026-06-19 13:51:11 +00:00
d1692e1951 Allow graph reuse for Gemma4 MTP ik/gemma4_mtp_graph_reuse Kawrakow 2026-06-19 09:34:45 +00:00
4bcfe5b872

Add compatibility for llama.cpp Gemma4 assistant GGUFs (#1995) Kawrakow 2026-06-19 11:24:54 +02:00
25d91dea44 Add compatibility for llama.cpp Gemma4 assistant GGUFs ik/compat_g4_assistant Kawrakow 2026-06-19 07:50:26 +00:00
d5c04c15fd

clean redudance in dflash graph and small logics (#1994) Samuel Oliveira Alves 2026-06-19 04:04:54 -03:00
7321648844

Fix Gemma4 MTP compute graph (#1993) Kawrakow 2026-06-19 09:00:44 +02:00
0d59973e4a

Fix MTP warmup for GLM models (#1992) Kawrakow 2026-06-19 08:59:55 +02:00
b3dfb7858c

AVX VNNI auto-activation for MSVC ; HAVE_VNNI256 path for IQ4_XS_R8 and Qx_0 R4 quants. (#1991) Nexes the Elder 2026-06-18 18:05:19 +02:00
67b0b22760 Fix Gemma4 MTP compute graph ik/fix_gemma4_mtp Kawrakow 2026-06-18 15:51:22 +00:00
2c1dc8781b Fix MTP warmup for GLM models ik/glm_mtp_warmup Kawrakow 2026-06-18 13:15:10 +00:00
3b81f63acd Update AUTHORS Kawrakow 2026-06-18 08:11:41 +00:00
21f918c185

faster ggml_cuda_host_malloc (#1988) Farmadupe 2026-06-18 09:01:34 +01:00
f5e5753c32

Fix Qwen35 mtp warmup (#1987) Kawrakow 2026-06-18 09:03:40 +02:00
71af16a6b7

Fix DFlash oerformance with split mode graph (#1980) Kawrakow 2026-06-17 18:40:02 +02:00
dc81d79cb6 Provide API to gtet the model arch string ik/fix_qwen_mtp_warmup Kawrakow 2026-06-17 16:18:32 +00:00
2ba9c2f404 Cleanup + remove unnecessary crippling performance by not using accept to sample draft token Kawrakow 2026-06-17 16:07:19 +00:00
ded03457a1 Fix Qwen35 MTP warmup Kawrakow 2026-06-17 15:42:27 +00:00
4f220159b8

Fix (Gemma-4 Vision): Correct KQ mask fill for causal models in non-causal flash-attn mode (#1985) gapeleon 2026-06-18 00:52:45 +10:00
5b9c3bbc3b Fix DFlash oerformance with split mode graph ik/dflash_fix_smgraph Kawrakow 2026-06-17 05:46:05 +00:00
71cf84c682 Use hidden state from prev token from qwen mtp SamuelOliveirads 2026-06-16 21:31:59 -03:00
064d23a6f8

Codex CLI Responses Compatibility (#1964) Jun Yamog 2026-06-17 01:28:16 +12:00
d37d92b54c

chat: add Cohere2MoE North Code parser (#1968) Joel Farthing 2026-06-16 08:27:30 -05:00
8420f91ae3

Merge pull request #1977 from ikawrakow/ik/dflash_fix_cpu Kawrakow 2026-06-16 15:26:23 +02:00
6f45163a95 Fix DFlash on the CPU ik/dflash_fix_cpu Kawrakow 2026-06-16 13:22:36 +00:00
f9078e169b

Merge pull request #1970 from SamuelOliveirads/feat/dflash-implementation Kawrakow 2026-06-16 15:07:55 +02:00
11c9935ce8

Merge pull request #1893 from ikawrakow/ik/gemma4_mtmd_blindness Kawrakow 2026-06-16 07:47:37 +02:00
ad24046b51 minor refactor in DFlash kv cache graph SamuelOliveirads 2026-06-15 18:22:56 -03:00
2f524850a1

Merge pull request #1973 from ikawrakow/ik/fattn_mma_gqa_16 Kawrakow 2026-06-15 15:24:01 +02:00
37ea89cabf

Merge pull request #1974 from Nexesenex/fix_muge_crash_minimax_m3 Kawrakow 2026-06-15 15:07:49 +02:00
3c9680fd3c Fix Minimax M3 crash when -muge merges up/gate experts Nexesenex 2026-06-15 14:36:14 +02:00
6be3a488d3 CUDA FA: faster TG when GQA is 16 and head size is 128 ik/fattn_mma_gqa_16 Kawrakow 2026-06-15 11:46:02 +00:00
f81673c7db

Merge pull request #1972 from ikawrakow/ik/minimaxm3_smgraph Kawrakow 2026-06-15 13:44:19 +02:00
e927adc4ad

Merge pull request #1969 from Farmadupe/resize_algo_fix Kawrakow 2026-06-15 13:39:11 +02:00
00d96744de

Merge pull request #1967 from Farmadupe/stb_image_resize2 Kawrakow 2026-06-15 13:38:31 +02:00
1dc4ea938a

Merge pull request #1962 from ikawrakow/ik/fix_1961 Kawrakow 2026-06-15 13:00:27 +02:00
c24d50dd88 Split mode graph for MiniMax-M3 ik/minimaxm3_smgraph Kawrakow 2026-06-15 08:41:34 +00:00
567854aeab

Merge pull request #1963 from jkyamog/minimax-m3-support Kawrakow 2026-06-15 10:16:10 +02:00
c08d194edd Use standard graph helpers for MiniMax-M3 Jun Yamog 2026-06-15 01:57:09 +00:00
c538210e6d Add MiniMax-M3 chat template Jun Yamog 2026-06-15 01:29:13 +00:00
6cae8c7ba2 clean logs SamuelOliveirads 2026-06-14 21:07:57 -03:00
19f08160ad Correct image resize algorithm for all qwens after qwen2vl and gemma4 Thomas Green 2026-06-14 21:57:11 +01:00
574f22b3c7 Replace image resizers with avx2/neon simd impls from stb_img_resize2.h Thomas Green 2026-05-31 06:12:36 +01:00
0d75eee35a remove duplicated code and unnecesary refactor SamuelOliveirads 2026-06-14 16:02:02 -03:00
4f1ec69ae5

Merge pull request #1965 from Nexesenex/fix_q8_0_graph_reduce_type Kawrakow 2026-06-14 16:32:48 +02:00
0fdac83272 Fix Q8_0 graph reduce type Nexesenex 2026-06-14 16:07:36 +02:00
0df00b3b94 Add preliminary MiniMax-M3 support Jun Yamog 2026-06-14 12:23:20 +00:00
c73bfbe9ce Fix #1961 ik/fix_1961 Kawrakow 2026-06-14 07:42:39 +00:00
670a3f6f5b

Merge pull request #1960 from BeccaLabs/fix/rpc-device-init Kawrakow 2026-06-14 08:14:07 +02:00
3b1a0f88d5 Add logging for DFlash statistics and clean up workspace handling SamuelOliveirads 2026-06-13 20:14:08 -03:00
053202f97a fix: initialize rpc_device endpoint and device index before parsing BECCA-Labs 2026-06-13 16:13:44 -05:00
3a1d46c4d1 Merge remote-tracking branch 'origin/main' into feat/dflash-implementation SamuelOliveirads 2026-06-13 17:27:52 -03:00
5f917a64b3

Merge pull request #1958 from ikawrakow/ik/handle_think_no_space Kawrakow 2026-06-12 21:27:23 +02:00
8a38025174

Refactor: Move spec outside server (#1949) Samuel Oliveira Alves 2026-06-12 13:12:39 -03:00
d1339249d7

Cleanup: Unify location of m-rope repacking for token and embd (#1924) Farmadupe 2026-06-12 07:27:50 +01:00
b1eb8bb0a1

server: gate llama_decode_stop() to the active decode (fix queued-cancel cascade) (#1941) Simon Lundell 2026-06-12 08:25:44 +02:00
5fb707d19b

Update docs (#1956) Marian M. 2026-06-12 09:24:22 +03:00
175819b4fb Style ik/handle_think_no_space Kawrakow 2026-06-12 06:19:06 +00:00
3dbc3241b9 Handle forced-open reasoning tag without trailing whitespace Kawrakow 2026-06-12 05:43:11 +00:00