From f4043fec0103872bf4339f6fa18d8b17824d5b6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <1629204+CISC@users.noreply.github.com> Date: Sat, 20 Jun 2026 12:42:36 +0200 Subject: [PATCH] convert : more consistent handling of rope_parameters (#24833) --- conversion/bailingmoe.py | 2 +- conversion/base.py | 8 +++++++- conversion/chatglm.py | 2 +- conversion/deci.py | 2 +- conversion/exaone.py | 6 +++--- conversion/gemma.py | 2 +- conversion/glm.py | 4 ++-- conversion/llama.py | 2 +- conversion/mimo.py | 2 +- conversion/minicpm.py | 16 ++++++---------- conversion/nemotron.py | 7 ++++--- conversion/phi.py | 20 +++++++++----------- conversion/qwen.py | 2 +- conversion/stablelm.py | 2 +- conversion/step3.py | 2 +- 15 files changed, 40 insertions(+), 39 deletions(-) diff --git a/conversion/bailingmoe.py b/conversion/bailingmoe.py index 319ff6dabe..2c6425cb64 100644 --- a/conversion/bailingmoe.py +++ b/conversion/bailingmoe.py @@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel): if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) diff --git a/conversion/base.py b/conversion/base.py index c872bcbb3c..08fd3747c4 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1119,8 +1119,10 @@ class TextModel(ModelBase): rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) + partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True) + original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True) - # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + # Ensure global params are mirrored in rope_parameters if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: if local_rope_theta is not None: self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} @@ -1128,6 +1130,10 @@ class TextModel(ModelBase): self.rope_parameters["rope_theta"] = rope_theta if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: self.rope_parameters["rope_type"] = rope_type + if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None: + self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor + if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None: + self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings @classmethod def __init_subclass__(cls): diff --git a/conversion/chatglm.py b/conversion/chatglm.py index 7e323b8900..801913075d 100644 --- a/conversion/chatglm.py +++ b/conversion/chatglm.py @@ -148,7 +148,7 @@ class ChatGLMModel(TextModel): rope_dim = self.hparams["attention_dim"] else: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))) self.gguf_writer.add_add_bos_token(False) rope_freq = 10000 if "rope_ratio" in self.hparams: diff --git a/conversion/deci.py b/conversion/deci.py index 46d8568c5a..be446eefa6 100644 --- a/conversion/deci.py +++ b/conversion/deci.py @@ -161,7 +161,7 @@ class DeciModel(TextModel): factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/conversion/exaone.py b/conversion/exaone.py index b21f027842..bc4fb3f1b1 100644 --- a/conversion/exaone.py +++ b/conversion/exaone.py @@ -24,7 +24,7 @@ class ExaoneModel(TextModel): assert (hparams["activation_function"] == "silu") - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = self.rope_parameters.get("partial_rotary_factor") rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) @@ -39,7 +39,7 @@ class ExaoneModel(TextModel): factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -104,7 +104,7 @@ class Exaone4Model(TextModel): factor = rope_params.get("factor", 16.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/conversion/gemma.py b/conversion/gemma.py index 5b4ca5c583..c552df732b 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model): self.gguf_writer.add_head_count_kv(value_arr) # handle n_rot differently for global vs swa layers - partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) + partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0) n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) self.gguf_writer.add_rope_dimension_count(n_rot_full) diff --git a/conversion/glm.py b/conversion/glm.py index 641937720d..895cefc22b 100644 --- a/conversion/glm.py +++ b/conversion/glm.py @@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_rope_dimension_count( - int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)) ) # MoE parameters - Use only routed expert count (shared experts handled separately) @@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model): super().set_gguf_parameters() rope_dim = self.hparams["qk_rope_head_dim"] - partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) + partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0) self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) # NextN/MTP prediction layers diff --git a/conversion/llama.py b/conversion/llama.py index b87bf92d46..a0d39472eb 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -289,7 +289,7 @@ class LlamaModel(TextModel): factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/conversion/mimo.py b/conversion/mimo.py index d4067aab4b..11ec286794 100644 --- a/conversion/mimo.py +++ b/conversion/mimo.py @@ -154,7 +154,7 @@ class MimoV2Model(TextModel): self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) + rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"]) self.gguf_writer.add_rope_dimension_count(rope_dim) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) diff --git a/conversion/minicpm.py b/conversion/minicpm.py index e9a4c4a74d..e31b26a008 100644 --- a/conversion/minicpm.py +++ b/conversion/minicpm.py @@ -32,11 +32,9 @@ class MiniCPMModel(TextModel): def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if long_factors or short_factors: if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') @@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if long_factors or short_factors: rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') diff --git a/conversion/nemotron.py b/conversion/nemotron.py index dfeeb97858..e44688a788 100644 --- a/conversion/nemotron.py +++ b/conversion/nemotron.py @@ -125,17 +125,18 @@ class NemotronModel(TextModel): self.gguf_writer.add_layer_norm_eps(f_norm_eps) # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + rot_pct = self.rope_parameters["partial_rotary_factor"] n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) # * RopeScaling for Nemotron - if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + factor = self.hparams.get("factor") or self.rope_parameters.get("factor") + if factor is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + self.gguf_writer.add_rope_scaling_factor(factor) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side diff --git a/conversion/phi.py b/conversion/phi.py index 5e0d72847a..df4bfe809a 100644 --- a/conversion/phi.py +++ b/conversion/phi.py @@ -18,7 +18,7 @@ class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): - rot_pct = self.find_hparam(["partial_rotary_factor"]) + rot_pct = self.rope_parameters["partial_rotary_factor"] n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel): n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) rms_eps = self.find_hparam(["rms_norm_eps"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"] + rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0) rope_dims = int(rot_pct * n_embd) // n_head self.gguf_writer.add_context_length(max_pos_embds) @@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"] + rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0) rope_dims = int(rot_pct * n_embd) // n_head # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if not long_factors: return scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() + rope_scaling_type = self.rope_parameters.get('rope_type', '').lower() if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') @@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel): self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') diff --git a/conversion/qwen.py b/conversion/qwen.py index 7eb135c832..6b85eb9aaf 100644 --- a/conversion/qwen.py +++ b/conversion/qwen.py @@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel): self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25))) @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: diff --git a/conversion/stablelm.py b/conversion/stablelm.py index ba5e9aa6ca..6e16378a03 100644 --- a/conversion/stablelm.py +++ b/conversion/stablelm.py @@ -28,7 +28,7 @@ class StableLMModel(TextModel): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + rotary_factor = self.rope_parameters["partial_rotary_factor"] self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) diff --git a/conversion/step3.py b/conversion/step3.py index 8c45b61c95..49bb5244a6 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -314,7 +314,7 @@ class Step35Model(TextModel): factor = float(rope_params.get("factor", 8.0)) low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) - old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) + old_context_len = int(rope_params.get("original_max_position_embeddings", 8192)) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor