mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
vocab : add tokenizer support for jina-embeddings-v2-base-zh (#18756)
* vocab : add jina-embeddings-v2-base-zh (whitespace tokenizer) * lowercase defaults to true * type fix --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
3292da09f6
commit
d4c8e2c29c
@ -1692,6 +1692,16 @@ class TextModel(ModelBase):
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_whitespace(self) -> None:
|
||||
tokens, toktypes, _ = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("whitespace")
|
||||
self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_hybriddna(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
@ -571,7 +571,16 @@ class JinaBertV2Model(BertModel):
|
||||
if tokenizer_class == 'BertTokenizer':
|
||||
super().set_vocab()
|
||||
elif tokenizer_class == 'RobertaTokenizer':
|
||||
self._set_vocab_gpt2()
|
||||
pre_tokenizer_type = None
|
||||
tokenizer_json_path = self.dir_model / "tokenizer.json"
|
||||
if tokenizer_json_path.is_file():
|
||||
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
|
||||
pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")
|
||||
|
||||
if pre_tokenizer_type == "Whitespace":
|
||||
self._set_vocab_whitespace()
|
||||
else:
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_token_type_count(2)
|
||||
else:
|
||||
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
||||
|
||||
@ -268,6 +268,8 @@ class Keys:
|
||||
CHAT_TEMPLATE = "tokenizer.chat_template"
|
||||
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
|
||||
CHAT_TEMPLATES = "tokenizer.chat_templates"
|
||||
# Normalizer constants
|
||||
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
|
||||
# FIM/Infill special tokens constants
|
||||
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
|
||||
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
|
||||
|
||||
@ -1110,6 +1110,9 @@ class GGUFWriter:
|
||||
|
||||
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||
|
||||
def add_normalizer_lowercase(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
|
||||
|
||||
def add_eot_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
||||
|
||||
|
||||
@ -52,6 +52,7 @@ class SpecialVocab:
|
||||
add_special_token: dict[str, bool]
|
||||
special_token_ids: dict[str, int]
|
||||
chat_template: str | Sequence[Mapping[str, str]] | None
|
||||
normalizer_lowercase: bool | None
|
||||
|
||||
def __init__(
|
||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||
@ -64,6 +65,7 @@ class SpecialVocab:
|
||||
self.load_merges = load_merges
|
||||
self.merges = []
|
||||
self.chat_template = None
|
||||
self.normalizer_lowercase = None
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
else:
|
||||
@ -102,6 +104,10 @@ class SpecialVocab:
|
||||
if not quiet:
|
||||
logger.info(f'Setting chat_template to {self.chat_template}')
|
||||
gw.add_chat_template(self.chat_template)
|
||||
if self.normalizer_lowercase is not None:
|
||||
if not quiet:
|
||||
logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
|
||||
gw.add_normalizer_lowercase(self.normalizer_lowercase)
|
||||
|
||||
def _load(self, path: Path) -> None:
|
||||
self._try_load_from_tokenizer_json(path)
|
||||
@ -146,6 +152,24 @@ class SpecialVocab:
|
||||
return
|
||||
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
|
||||
|
||||
def _parse_normalizer(self, normalizer: dict) -> None:
|
||||
# ref: https://huggingface.co/docs/tokenizers/api/normalizers
|
||||
#
|
||||
# Detects lowercase normalization in three possible formats:
|
||||
# 1. Standalone: {"type": "Lowercase"}
|
||||
# 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
|
||||
# 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
|
||||
|
||||
normalizer_type = normalizer.get('type')
|
||||
if normalizer_type == 'Lowercase':
|
||||
self.normalizer_lowercase = True
|
||||
elif normalizer_type == 'BertNormalizer':
|
||||
if 'lowercase' in normalizer:
|
||||
self.normalizer_lowercase = normalizer['lowercase']
|
||||
elif normalizer_type == 'Sequence':
|
||||
for norm in normalizer.get('normalizers', []):
|
||||
self._parse_normalizer(norm)
|
||||
|
||||
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer = None
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
@ -178,6 +202,9 @@ class SpecialVocab:
|
||||
]
|
||||
else:
|
||||
raise ValueError("Unknown tokenizer merges format")
|
||||
# Parse normalizer configuration (e.g. Lowercase) into metadata
|
||||
if normalizer := tokenizer.get('normalizer'):
|
||||
self._parse_normalizer(normalizer)
|
||||
added_tokens = tokenizer.get('added_tokens', {})
|
||||
else:
|
||||
added_tokens = {}
|
||||
|
||||
@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
||||
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||
|
||||
@ -308,6 +308,7 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
|
||||
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
|
||||
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
||||
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
||||
LLM_KV_TOKENIZER_FIM_MID_ID,
|
||||
|
||||
@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_WHITESPACE:
|
||||
// whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh)
|
||||
regex_exprs = {
|
||||
"\\S+",
|
||||
};
|
||||
byte_encode = false;
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@ -1671,6 +1678,35 @@ private:
|
||||
const llama_vocab & vocab;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
|
||||
const bool lowercase = vocab.get_normalizer_lowercase();
|
||||
|
||||
std::string segment;
|
||||
auto flush = [&]() {
|
||||
if (!segment.empty()) {
|
||||
llm_tokenizer_bpe_session::tokenize(segment, output);
|
||||
segment.clear();
|
||||
}
|
||||
};
|
||||
|
||||
for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
|
||||
// drop whitespace
|
||||
if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) {
|
||||
flush();
|
||||
} else {
|
||||
segment += unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
|
||||
}
|
||||
}
|
||||
flush();
|
||||
}
|
||||
|
||||
private:
|
||||
const llama_vocab & vocab;
|
||||
};
|
||||
|
||||
//
|
||||
// impl
|
||||
//
|
||||
@ -1751,6 +1787,7 @@ struct llama_vocab::impl {
|
||||
bool remove_extra_whitespaces = false;
|
||||
bool escape_whitespaces = true;
|
||||
bool treat_whitespace_as_suffix = false;
|
||||
bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json)
|
||||
|
||||
std::unordered_map<std::string, llama_token> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
special_mask_id = 103;
|
||||
|
||||
add_sep = true;
|
||||
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
|
||||
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") {
|
||||
type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "roberta-bpe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
add_sep = true;
|
||||
} else if (
|
||||
tokenizer_pre == "whitespace") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||||
@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
|
||||
}
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||||
@ -3264,6 +3305,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
||||
std::unique_ptr<llm_tokenizer_bpe_session> session;
|
||||
if (vocab.get_tokenizer_model() == "hybriddna") {
|
||||
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
|
||||
} else if (vocab.get_tokenizer_model() == "whitespace") {
|
||||
session = std::make_unique<llm_tokenizer_whitespace_session>(vocab, *tok_bpe);
|
||||
} else {
|
||||
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
|
||||
}
|
||||
@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
|
||||
return pimpl->treat_whitespace_as_suffix;
|
||||
}
|
||||
|
||||
bool llama_vocab::get_normalizer_lowercase() const {
|
||||
return pimpl->normalizer_lowercase;
|
||||
}
|
||||
|
||||
int llama_vocab::max_token_len() const {
|
||||
return pimpl->max_token_len;
|
||||
}
|
||||
|
||||
@ -61,6 +61,7 @@ enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
@ -138,6 +139,7 @@ struct llama_vocab {
|
||||
bool get_remove_extra_whitespaces () const;
|
||||
bool get_escape_whitespaces () const;
|
||||
bool get_treat_whitespace_as_suffix() const;
|
||||
bool get_normalizer_lowercase () const;
|
||||
|
||||
int max_token_len() const;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user