convert : more consistent handling of rope_parameters (#24833)

This commit is contained in:
Sigbjørn Skjæret 2026-06-20 12:42:36 +02:00 committed by GitHub
parent f449e05537
commit f4043fec01
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 40 additions and 39 deletions

View File

@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

View File

@ -1119,8 +1119,10 @@ class TextModel(ModelBase):
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
# Ensure global params are mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
if local_rope_theta is not None:
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
@classmethod
def __init_subclass__(cls):

View File

@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
rope_dim = self.hparams["attention_dim"]
else:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_add_bos_token(False)
rope_freq = 10000
if "rope_ratio" in self.hparams:

View File

@ -161,7 +161,7 @@ class DeciModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
assert (hparams["activation_function"] == "silu")
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
factor = rope_params.get("factor", 16.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
self.gguf_writer.add_head_count_kv(value_arr)
# handle n_rot differently for global vs swa layers
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
self.gguf_writer.add_rope_dimension_count(n_rot_full)

View File

@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_rope_dimension_count(
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
)
# MoE parameters - Use only routed expert count (shared experts handled separately)
@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
super().set_gguf_parameters()
rope_dim = self.hparams["qk_rope_head_dim"]
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
# NextN/MTP prediction layers

View File

@ -289,7 +289,7 @@ class LlamaModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

View File

@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))

View File

@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
rope_dims = self.hparams["qk_rope_head_dim"]
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

View File

@ -125,17 +125,18 @@ class NemotronModel(TextModel):
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
if factor is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
self.gguf_writer.add_rope_scaling_factor(factor)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side

View File

@ -18,7 +18,7 @@ class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self):
rot_pct = self.find_hparam(["partial_rotary_factor"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if not long_factors:
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

View File

@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:

View File

@ -28,7 +28,7 @@ class StableLMModel(TextModel):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
rotary_factor = self.rope_parameters["partial_rotary_factor"]
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])

View File

@ -314,7 +314,7 @@ class Step35Model(TextModel):
factor = float(rope_params.get("factor", 8.0))
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor