From 869b83bc49752aa862aa5d95509666e517ed1e05 Mon Sep 17 00:00:00 2001 From: dungquixote42 <62397442+dungquixote42@users.noreply.github.com> Date: Fri, 10 Apr 2026 12:22:57 -0400 Subject: [PATCH] Add Unicode allowlist (#1597) * initial commit * cleanup * fix whitelist arg parsing and simplify keyword search state * rename white* to allow* * add vocab_pieces init function, rename update functions, delete accidentally added file * delete temporary bias code * auto-generate fill function with script data inside * deduplicate allowlist unicode rule parsing * minor cleanup * delete unnecessary header * refactor allowlist to support sequential rule sets via keywords * add early exit for zero-rules case * delete accidentally added file --- Makefile | 1 + common/common.cpp | 67 +- common/common.h | 16 + common/sampling.cpp | 10 +- common/sampling.h | 2 + examples/server/server-context.cpp | 160 ++- examples/server/server-context.h | 18 + include/llama.h | 2 + scripts/gen-unicode-script-data.py | 110 ++ src/CMakeLists.txt | 1 + src/llama.cpp | 3 + src/unicode-script-data.cpp | 2005 ++++++++++++++++++++++++++++ src/unicode.h | 2 + 13 files changed, 2393 insertions(+), 4 deletions(-) create mode 100644 scripts/gen-unicode-script-data.py create mode 100644 src/unicode-script-data.cpp diff --git a/Makefile b/Makefile index d5286218..0480fec1 100644 --- a/Makefile +++ b/Makefile @@ -1119,6 +1119,7 @@ src/unicode.o: \ $(CXX) $(CXXFLAGS) -c $< -o $@ src/unicode-data.o: \ + src/unicode-script-data.cpp \ src/unicode-data.cpp \ src/unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/common/common.cpp b/common/common.cpp index d5ce6596..d95e1008 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1698,6 +1698,30 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.banned_n = std::stoi(argv[i]); return true; } + if (arg == "--allowlist-unicode-rule") { + CHECK_ARG + if (params.allow_ruless.size() == 0) { + params.allow_ruless.push_back({}); + } + params.allow_ruless.back().push_back(argparse_allowlist_unicode_rule(argv[i])); + return true; + } + if (arg == "--allowlist-pieces") { + CHECK_ARG + params.allow_pieces.push_back(argv[i]); + return true; + } + if (arg == "--allowlist-keyword") { + CHECK_ARG + params.allow_kws.push_back(argv[i]); + params.allow_ruless.push_back({}); + return true; + } + if (arg == "--allowlist-keyword-delay") { + CHECK_ARG + params.allow_kw_delay = std::stoul(argv[i]); + return true; + } if (arg == "-ld" || arg == "--logdir") { CHECK_ARG params.logdir = argv[i]; @@ -2442,9 +2466,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --top-n-sigma t", "top-n-sigma parmeter (default: %.1f, 0.0 = disabled)", (double)sparams.top_n_sigma}); options.push_back({ "*", " --adaptive-target", "adaptive-p sampling: (default: %.2f, <0.0 = disabled)", (double)sparams.adaptive_target}); options.push_back({ "*", " --adaptive-decay", "adaptive-p sampling: (default: %.2f)", (double)sparams.adaptive_decay}); + options.push_back({ "*", " --adaptive-updt-w-cur", "adaptive-p sampling: (default: %s)", sparams.adaptive_updt_w_cur ? "true" : "false"}); options.push_back({ "*", " --banned-string-file", "file path of the list of banned strings on each line" }); options.push_back({ "*", " --banned-n", "number of tokens banned in the phrase during rewind. -1 means all tokens: (default: %d)",params.banned_n }); - options.push_back({ "*", " --adaptive-updt-w-cur", "adaptive-p sampling: (default: %s)", sparams.adaptive_updt_w_cur ? "true" : "false"}); + options.push_back({ "*", " --allowlist-unicode-rule", + "rule for allowlisting unicode script and/or codepoints. disabled without any rule. format: `LOWER..UPPER,SCRIPT:BIAS`\n" + "if unspecified: LOWER = 0, UPPER = -1(=max), SCRIPT=\"\", BIAS = 0. at least one of LOWER, UPPER, or SCRIPT is required\n" }); + options.push_back({ "*", " --allowlist-pieces", "allowlist each token in argument. inherits max BIAS in --allowlist-unicode-rule. overrides --allowlist-unicode-rule" }); + options.push_back({ "*", " --allowlist-keyword", "keyword to expire earlier allowlist rules if matched during generation. does not affect later rules" }); + options.push_back({ "*", " --allowlist-keyword-delay", + "# tokens to delay matching for the first keyword (default: %zu)", params.allow_kw_delay }); options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); @@ -4557,3 +4588,37 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); } + +// +// Argparse utils +// + +std::tuple argparse_allowlist_unicode_rule(std::string argstr) { + // format: + // LOWER..UPPER,SCRIPT:BIAS + + auto subs = string_split(argstr, ":"); + float bias = subs.size() == 1 ? 0 : std::stof(subs[1]); + + subs = string_split(subs[0], ","); + std::string script = std::all_of(subs.back().begin(), subs.back().end(), [](char c) { + return std::isalpha(c); + }) ? string_lower(subs.back()) : "*"; + if (script == "ascii") { + return { 0x000000, 0x00007F, "*", bias }; + } + + uint32_t first = 0; + uint32_t last = -1; + if ((script == "*") || (subs.size() > 1)) { + subs = string_split(subs.front(), "."); + if (!subs.front().empty()) { + first = std::stoul(subs.front()); + } + if (!subs.back().empty()) { + last = std::stoul(subs.back()); + } + } + + return { std::min(first, last), std::max(first, last), script, bias }; +} diff --git a/common/common.h b/common/common.h index 6c10fdf4..3616da8a 100644 --- a/common/common.h +++ b/common/common.h @@ -288,6 +288,16 @@ struct gpt_params { size_t n_buffer = 0; // number of token buffers for string ban bool can_ban_phrases = true; // whether to ban strings + std::vector>> allow_ruless; + std::vector allow_pieces; // each token to allowlist + std::vector allow_kws; // keywords + size_t allow_kw_delay; // minimum n_decoded before first keyword is active + std::vector kv_overrides; std::vector tensor_buft_overrides; std::vector> offload_policy; @@ -735,3 +745,9 @@ void yaml_dump_non_result_info( const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); std::string string_format(const char* fmt, ...); + +// +// Argparse utils +// + +std::tuple argparse_allowlist_unicode_rule(std::string argstr); diff --git a/common/sampling.cpp b/common/sampling.cpp index 2204d6d5..3c0b7484 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -505,8 +505,14 @@ static llama_token_data_array llama_sampling_prepare_impl( cur.resize(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + if ((ctx_sampling->server_biases != nullptr) && (ctx_sampling->server_biases->size() == n_vocab)) { + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id] + ctx_sampling->server_biases->at(token_id), 0.0f}; + } + } else { + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } } ctx_sampling->cur_p = { cur.data(), cur.size(), false }; diff --git a/common/sampling.h b/common/sampling.h index 7c19c73f..94042fe1 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -134,6 +134,8 @@ struct common_sampler { llama_token_data_array cur_p; // current candidates std::mt19937 rng; + + std::vector* server_biases; }; diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index dcad17ef..81422dc5 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -361,6 +361,7 @@ void server_slot::prompt_load(server_prompt_cache& prompt_cache, const server_to void server_slot::reset() { n_prompt_tokens = 0; + last_gentxt_size = 0; generated_text = ""; truncated = false; stopped_eos = false; @@ -394,6 +395,12 @@ void server_slot::reset() { ban_regex.clear(); ban_regex_ci.clear(); + allow_ruless.clear(); + allow_pieces.clear(); + allow_kws.clear(); + allow_kw_delay = 0; + allow_idx = 0; + // Reset speculative decoding stats n_draft_total = 0; n_draft_accepted = 0; @@ -851,6 +858,19 @@ server_slot* server_context::get_available_slot(const server_task& task) { return ret; } +int32_t server_context::populate_vocab_pieces() { + const int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + if (vocab_pieces.size() == n_vocab) { + return n_vocab; + } + vocab_pieces.clear(); + vocab_pieces.reserve(n_vocab); + for (int32_t id = 0; id < n_vocab; ++id) { + vocab_pieces.push_back(common_token_to_piece(ctx, id, true)); + } + return n_vocab; +} + bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) { slot_params defaults; defaults.speculative = params_base.speculative; @@ -1338,6 +1358,106 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) slot.ban_phrases_bias = json_value(data, "banned_bias", params_base.ban_phrases_bias); slot.banned_n = json_value(data, "banned_n", params_base.banned_n); } + + do // populate allowlist biases + { + // TODO: JSON parsing for rules and keywords + slot.allow_ruless = params_base.allow_ruless; + if (slot.allow_ruless.size() == 0) { + slot.allow_biasess.clear(); + break; + } + slot.allow_kws = params_base.allow_kws; + + slot.allow_pieces = params_base.allow_pieces; + const auto& allowlist_piece_array = data.find("allowlist_piece_array"); + if (allowlist_piece_array != data.end() && allowlist_piece_array->is_array()) { + slot.allow_pieces.clear(); + for (const auto& piece: *allowlist_piece_array) { + if (piece.is_string()) { + slot.allow_pieces.push_back(piece.get()); + } + } + } + + slot.allow_kw_delay = json_value(data, "allowlist_keyword_delay", params_base.allow_kw_delay); + // end of allowlist criteria update + + const int32_t n_vocab = populate_vocab_pieces(); + + std::unordered_set allow_settoken; + for (const auto& piece: slot.allow_pieces) { + for (const auto token: common_tokenize(model, piece, false, true)) { + allow_settoken.insert(token); + } + } + + auto n_rules = slot.allow_ruless.size(); + if (n_rules > slot.allow_kws.size() + 1) { + // one more rules than keyword, last rules do not expire + n_rules = slot.allow_kws.size() + 1; + slot.allow_ruless.resize(n_rules); + } else if (n_rules < slot.allow_kws.size()) { + // every rules expire + slot.allow_kws.resize(n_rules); + } + slot.allow_biasess.resize(n_rules); + + for (size_t i = 0; i < n_rules; ++i) { + const auto& rules = slot.allow_ruless[i]; + if ((i < slot.allow_ruless_prev.size()) && (rules == slot.allow_ruless_prev[i])) { + continue; + } + LLAMA_LOG_DEBUG("%s: allowlist %zu is new\n", __func__, i); + + auto& biases = slot.allow_biasess[i]; + biases.resize(n_vocab); + + std::vector cpts; + std::vector scripts; + for (size_t id = 0; id < n_vocab; ++id) { + const size_t n_cpt = llama_fill_from_utf8(&vocab_pieces[id], &cpts, &scripts); + float bias = -INFINITY; + + // each codepoint must be found in + for (size_t j = 0; j < n_cpt; ++j) { + bool in_rule = false; + + // at least one rule + for (const auto& rule: rules) { + const bool in_range = (std::get<0>(rule) <= cpts[j]) && (cpts[j] <= std::get<1>(rule)); + in_rule = in_range && ((std::get<2>(rule) == "*") || std::get<2>(rule) == scripts[j]); + if (in_rule) { + // earlier rule has higher priority + bias = std::max(bias, std::get<3>(rule)); + break; + } + } + if (!in_rule) { + if ((scripts[j] == "common") || (scripts[j] == "inherited")) { + // for common or inherited codepoints (e.g. whitespace), defer to other codepoints in the token + continue; + } + + // to shadow realm + bias = -INFINITY; + break; + } + } + biases[id] = bias; + } + + float max_bias = -INFINITY; + for (const auto& rule: rules) { + max_bias = std::max(max_bias, std::get<3>(rule)); + } + for (const auto token: allow_settoken) { + biases[token] = max_bias; + } + } + } while (false); + slot.allow_ruless_prev = slot.allow_ruless; + if (llama_model_has_recurrent(llama_get_model(slot.ctx))) { params_base.can_ban_phrases = false; bool do_checkpoint = params_base.ctx_checkpoints_n > 0; @@ -1498,6 +1618,7 @@ bool server_context::process_token(completion_token_output& result, server_slot& slot.sampled = result.tok; // search stop word and delete it + slot.last_gentxt_size = slot.generated_text.size(); slot.generated_text += token_str; slot.has_next_token = true; @@ -1930,6 +2051,16 @@ void server_context::send_embedding(const server_slot& slot, const llama_batch& queue_results.send(std::move(res)); } +void server_context::apply_server_biases(server_slot& slot) { + auto& server_biases = slot.ctx_sampling->server_biases; + + if (slot.allow_idx < slot.allow_biasess.size()) { + server_biases = &slot.allow_biasess[slot.allow_idx]; + } else { + server_biases = nullptr; + } +} + void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs) { server_task task; task.id = id_task; @@ -3422,6 +3553,8 @@ void server_context::speculative_decoding_accept() { size_t n_draft = slot.drafted.size(); + apply_server_biases(slot); + // the accepted tokens from the speculation const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted); @@ -3502,6 +3635,8 @@ void server_context::speculative_decoding_accept() { } common_sampler_review(slot.ctx_sampling, slot.token_buffer.size(), slot.rewind_status); + + update_allowlist_state(slot); } SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int)ids.size() - 1, (int)slot.drafted.size(), slot.n_past); LOG_VERBOSE("speculative decoding result", { @@ -3677,7 +3812,7 @@ inline void rewind_context(server_slot& slot, int32_t ban_pos) { size_t n_keep_cache = 0; if (ban_pos > 0) { n_keep_cache = (size_t)(ban_pos - 1); -} + } if (n_keep_cache > slot.cache_tokens.size()) { n_keep_cache = slot.cache_tokens.size(); @@ -3769,6 +3904,25 @@ void server_context::buffer_and_check_string_ban(server_slot & slot, completion_ } } +void server_context::update_allowlist_state(server_slot& slot) { + const auto& kws = slot.allow_kws; + auto& idx = slot.allow_idx; + if ((slot.allow_kw_delay > slot.n_decoded) || (idx >= kws.size())) { + return; + } + + // search for keyword + auto kw = kws[idx]; + auto pos = slot.generated_text.find(kw, std::max(0, slot.last_gentxt_size - (int32_t)kw.length() + 1)); + while (pos != std::string::npos) { + if (++idx >= kws.size()) { + break; + } + kw = kws[idx]; + pos = slot.generated_text.find(kw, pos + 1); + } +} + void server_context::process_batch_tokens(int32_t & n_batch) { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); @@ -3901,6 +4055,8 @@ void server_context::process_batch_tokens(int32_t & n_batch) { } } + apply_server_biases(slot); + const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, tok_idx); common_sampler_accept(slot.ctx_sampling, ctx, id, true); @@ -3944,6 +4100,8 @@ void server_context::process_batch_tokens(int32_t & n_batch) { common_sampler_review(slot.ctx_sampling, slot.token_buffer.size(), slot.rewind_status); + update_allowlist_state(slot); + slot.i_batch = -1; } if (mtp_warmup_needed && !batch_mtp_hidden_state.empty()) { diff --git a/examples/server/server-context.h b/examples/server/server-context.h index ed7fc4e8..50e05131 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -64,6 +64,7 @@ struct server_slot { server_tokens prompt_tokens; server_tokens cache_tokens; + int32_t last_gentxt_size = 0; std::string generated_text; // idx of draft tokens in the main batch @@ -102,6 +103,15 @@ struct server_slot { int32_t banned_n = 1; std::map> positional_bans; + // allowlist + std::vector>> allow_ruless_prev; + std::vector>> allow_ruless; + std::vector allow_pieces; + std::vector allow_kws; + size_t allow_kw_delay = 0; + std::vector> allow_biasess; + size_t allow_idx = 0; + server_prompt server_cached_prompt; void prompt_save(server_prompt_cache& prompt_cache) const; @@ -222,6 +232,8 @@ struct server_context { std::vector lora_adapters; std::vector control_vectors; + std::vector vocab_pieces; + gpt_params params_base; llama_batch batch; @@ -284,6 +296,8 @@ struct server_context { server_slot* get_available_slot(const server_task& task); + int32_t populate_vocab_pieces(); + bool launch_slot_with_task(server_slot& slot, server_task& task); void kv_cache_clear(); @@ -313,6 +327,8 @@ struct server_context { void send_embedding(const server_slot& slot, const llama_batch& batch); + void apply_server_biases(server_slot& slot); + void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs); void request_cancel(int id_task); @@ -361,6 +377,8 @@ struct server_context { void buffer_and_check_string_ban(server_slot& slot, completion_token_output& result); + void update_allowlist_state(server_slot& slot); + json model_meta() const; // Re-aggregates all active vectors and updates the model state diff --git a/include/llama.h b/include/llama.h index 97117dfb..388caff8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1559,4 +1559,6 @@ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_ #endif // LLAMA_API_INTERNAL +size_t llama_fill_from_utf8(void* utf8, void* cpts, void* scripts); + #endif // LLAMA_H diff --git a/scripts/gen-unicode-script-data.py b/scripts/gen-unicode-script-data.py new file mode 100644 index 00000000..aa16766d --- /dev/null +++ b/scripts/gen-unicode-script-data.py @@ -0,0 +1,110 @@ + +from collections import defaultdict + +import requests + +MAX_CODEPOINTS = 0x110000 + +SCRIPT_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt" + + +res = requests.get(SCRIPT_DATA_URL) +res.raise_for_status() +data = res.content.decode() + +cptL_cptU_script = [] +for line in data.splitlines(): + line = line.split() + if len(line) <= 1 or line[0] == "#": + continue + + cpt = line[0].split("..") + if len(cpt) == 1: + cpt += cpt + cpt_lower, cpt_upper = cpt + + cpt_lower = int(cpt_lower, 16) + if cpt_lower >= MAX_CODEPOINTS: + break + + cpt_upper = int(cpt_upper, 16) + if cpt_upper >= MAX_CODEPOINTS: + break + + assert line[1] == ";" + + script = line[2].lower() + + assert line[3] == "#" + + # categ = line[4] + # assert len(categ) == 2 + + cptL_cptU_script.append([cpt_lower, cpt_upper, script]) + +cptL_cptU_script.sort(key=lambda x: x[0]) # just in case + +# merge neighboring codepoints that belong to same script +im = 0 # merge index +for cpt_lower, cpt_upper, script in cptL_cptU_script[1:]: + if (cptL_cptU_script[im][2] == script) and (cptL_cptU_script[im][1] + 1 == cpt_lower): + cptL_cptU_script[im][1] = cpt_upper + else: + im += 1 + cptL_cptU_script[im] = [cpt_lower, cpt_upper, script] +del cptL_cptU_script[im + 1:] + +def out(line=""): + print(line, end='\n') # noqa + +# Generate 'unicode-script-data.cpp': +# python scripts/gen-unicode-script-data.py > src/unicode-script-data.cpp + +out("""\ +// generated with scripts/gen-unicode-script-data.py + +#include "unicode.h" +#include "unicode-data.h" +""") + +out("""\ +size_t unicode_fill_from_utf8(std::string* utf8, std::vector* dst_cpts, std::vector* dst_scripts) { + if (utf8 == nullptr) { + return 0; + } +""") + +out("static const std::vector unicode_scripts = {") +for _, _, script in cptL_cptU_script: + out(" \"%s\"," % script) +out("};") + +out("static const std::vector unicode_script_lasts = {") +for _, cpt_upper, _ in cptL_cptU_script: + out(" 0x%06X," % cpt_upper) +out("};") + +out("""\ + const auto cpts = unicode_cpts_from_utf8(*utf8); + const size_t n_cpt = cpts.size(); + + std::vector scripts; + scripts.reserve(n_cpt); + + for (const auto& cpt: cpts) { + const auto it = std::lower_bound(unicode_script_lasts.begin(), unicode_script_lasts.end(), cpt); + if (it != unicode_script_lasts.end()) { + scripts.push_back(unicode_scripts[std::distance(unicode_script_lasts.begin(), it)]); + } + } + + if (dst_cpts != nullptr) { + *dst_cpts = cpts; + } + if (dst_scripts != nullptr) { + *dst_scripts = scripts; + } + + return n_cpt; +} +""") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3bf2e1ec..48fe0aba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,7 @@ add_library(llama unicode.h unicode.cpp unicode-data.cpp + unicode-script-data.cpp ) target_include_directories(llama PUBLIC . ../include) diff --git a/src/llama.cpp b/src/llama.cpp index 2c3e19eb..6e604aae 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9122,3 +9122,6 @@ void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float ctx->draft_input_hidden_state = hidden_state; } +size_t llama_fill_from_utf8(void* utf8, void* cpts, void* scripts) { + return unicode_fill_from_utf8((std::string*)utf8, (std::vector*)cpts, (std::vector*)scripts); +} diff --git a/src/unicode-script-data.cpp b/src/unicode-script-data.cpp new file mode 100644 index 00000000..f49ef7f2 --- /dev/null +++ b/src/unicode-script-data.cpp @@ -0,0 +1,2005 @@ +// generated with scripts/gen-unicode-script-data.py + +#include "unicode.h" +#include "unicode-data.h" + +size_t unicode_fill_from_utf8(std::string* utf8, std::vector* dst_cpts, std::vector* dst_scripts) { + if (utf8 == nullptr) { + return 0; + } + +static const std::vector unicode_scripts = { + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "bopomofo", + "common", + "inherited", + "greek", + "common", + "greek", + "greek", + "common", + "greek", + "greek", + "common", + "greek", + "common", + "greek", + "greek", + "greek", + "greek", + "coptic", + "greek", + "cyrillic", + "inherited", + "cyrillic", + "armenian", + "armenian", + "armenian", + "hebrew", + "hebrew", + "hebrew", + "arabic", + "common", + "arabic", + "common", + "arabic", + "common", + "arabic", + "common", + "arabic", + "common", + "arabic", + "inherited", + "arabic", + "inherited", + "arabic", + "common", + "arabic", + "syriac", + "syriac", + "syriac", + "arabic", + "thaana", + "nko", + "nko", + "samaritan", + "samaritan", + "mandaic", + "mandaic", + "syriac", + "arabic", + "arabic", + "common", + "arabic", + "devanagari", + "inherited", + "devanagari", + "common", + "devanagari", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "bengali", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gurmukhi", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "gujarati", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "oriya", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "tamil", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "telugu", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "kannada", + "malayalam", + "malayalam", + "malayalam", + "malayalam", + "malayalam", + "malayalam", + "malayalam", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "sinhala", + "thai", + "common", + "thai", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "lao", + "tibetan", + "tibetan", + "tibetan", + "tibetan", + "tibetan", + "tibetan", + "common", + "tibetan", + "myanmar", + "georgian", + "georgian", + "georgian", + "georgian", + "common", + "georgian", + "hangul", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "cherokee", + "cherokee", + "canadian_aboriginal", + "ogham", + "runic", + "common", + "runic", + "tagalog", + "tagalog", + "hanunoo", + "common", + "buhid", + "tagbanwa", + "tagbanwa", + "tagbanwa", + "khmer", + "khmer", + "khmer", + "mongolian", + "common", + "mongolian", + "common", + "mongolian", + "mongolian", + "mongolian", + "canadian_aboriginal", + "limbu", + "limbu", + "limbu", + "limbu", + "limbu", + "tai_le", + "tai_le", + "new_tai_lue", + "new_tai_lue", + "new_tai_lue", + "new_tai_lue", + "khmer", + "buginese", + "buginese", + "tai_tham", + "tai_tham", + "tai_tham", + "tai_tham", + "tai_tham", + "inherited", + "inherited", + "balinese", + "balinese", + "sundanese", + "batak", + "batak", + "lepcha", + "lepcha", + "lepcha", + "ol_chiki", + "cyrillic", + "georgian", + "georgian", + "sundanese", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "latin", + "greek", + "cyrillic", + "latin", + "greek", + "latin", + "greek", + "latin", + "cyrillic", + "latin", + "greek", + "inherited", + "latin", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "greek", + "common", + "inherited", + "common", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "inherited", + "common", + "greek", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "latin", + "common", + "common", + "common", + "common", + "braille", + "common", + "common", + "glagolitic", + "latin", + "coptic", + "coptic", + "georgian", + "georgian", + "georgian", + "tifinagh", + "tifinagh", + "tifinagh", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "cyrillic", + "common", + "han", + "han", + "han", + "common", + "han", + "common", + "han", + "common", + "han", + "inherited", + "hangul", + "common", + "han", + "common", + "hiragana", + "inherited", + "common", + "hiragana", + "common", + "katakana", + "common", + "katakana", + "bopomofo", + "hangul", + "common", + "bopomofo", + "common", + "common", + "katakana", + "hangul", + "common", + "hangul", + "common", + "katakana", + "common", + "katakana", + "common", + "han", + "common", + "han", + "yi", + "yi", + "lisu", + "vai", + "cyrillic", + "bamum", + "common", + "latin", + "common", + "latin", + "latin", + "syloti_nagri", + "common", + "phags_pa", + "saurashtra", + "saurashtra", + "devanagari", + "kayah_li", + "common", + "kayah_li", + "rejang", + "rejang", + "hangul", + "javanese", + "common", + "javanese", + "javanese", + "myanmar", + "cham", + "cham", + "cham", + "cham", + "myanmar", + "tai_viet", + "tai_viet", + "meetei_mayek", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "latin", + "common", + "latin", + "greek", + "latin", + "common", + "cherokee", + "meetei_mayek", + "meetei_mayek", + "hangul", + "hangul", + "hangul", + "han", + "han", + "latin", + "armenian", + "hebrew", + "hebrew", + "hebrew", + "hebrew", + "hebrew", + "hebrew", + "arabic", + "common", + "arabic", + "arabic", + "inherited", + "common", + "inherited", + "cyrillic", + "common", + "common", + "common", + "arabic", + "arabic", + "common", + "common", + "latin", + "common", + "latin", + "common", + "katakana", + "common", + "katakana", + "common", + "hangul", + "hangul", + "hangul", + "hangul", + "hangul", + "common", + "common", + "common", + "linear_b", + "linear_b", + "linear_b", + "linear_b", + "linear_b", + "linear_b", + "linear_b", + "common", + "common", + "common", + "greek", + "common", + "greek", + "common", + "inherited", + "lycian", + "carian", + "inherited", + "common", + "old_italic", + "old_italic", + "gothic", + "old_permic", + "ugaritic", + "ugaritic", + "old_persian", + "old_persian", + "deseret", + "shavian", + "osmanya", + "osmanya", + "osage", + "osage", + "elbasan", + "caucasian_albanian", + "caucasian_albanian", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "vithkuqi", + "todhri", + "linear_a", + "linear_a", + "linear_a", + "latin", + "latin", + "latin", + "cypriot", + "cypriot", + "cypriot", + "cypriot", + "cypriot", + "cypriot", + "imperial_aramaic", + "imperial_aramaic", + "palmyrene", + "nabataean", + "nabataean", + "hatran", + "hatran", + "hatran", + "phoenician", + "phoenician", + "lydian", + "lydian", + "sidetic", + "meroitic_hieroglyphs", + "meroitic_cursive", + "meroitic_cursive", + "meroitic_cursive", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "kharoshthi", + "old_south_arabian", + "old_north_arabian", + "manichaean", + "manichaean", + "avestan", + "avestan", + "inscriptional_parthian", + "inscriptional_parthian", + "inscriptional_pahlavi", + "inscriptional_pahlavi", + "psalter_pahlavi", + "psalter_pahlavi", + "psalter_pahlavi", + "old_turkic", + "old_hungarian", + "old_hungarian", + "old_hungarian", + "hanifi_rohingya", + "hanifi_rohingya", + "garay", + "garay", + "garay", + "arabic", + "yezidi", + "yezidi", + "yezidi", + "arabic", + "arabic", + "arabic", + "old_sogdian", + "sogdian", + "old_uyghur", + "chorasmian", + "elymaic", + "brahmi", + "brahmi", + "brahmi", + "kaithi", + "kaithi", + "sora_sompeng", + "sora_sompeng", + "chakma", + "chakma", + "mahajani", + "sharada", + "sinhala", + "khojki", + "khojki", + "multani", + "multani", + "multani", + "multani", + "multani", + "khudawadi", + "khudawadi", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "inherited", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "grantha", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "tulu_tigalari", + "newa", + "newa", + "tirhuta", + "tirhuta", + "siddham", + "siddham", + "modi", + "modi", + "mongolian", + "takri", + "takri", + "myanmar", + "ahom", + "ahom", + "ahom", + "dogra", + "warang_citi", + "warang_citi", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "dives_akuru", + "nandinagari", + "nandinagari", + "nandinagari", + "zanabazar_square", + "soyombo", + "canadian_aboriginal", + "pau_cin_hau", + "devanagari", + "sharada", + "sunuwar", + "sunuwar", + "bhaiksuki", + "bhaiksuki", + "bhaiksuki", + "bhaiksuki", + "marchen", + "marchen", + "marchen", + "masaram_gondi", + "masaram_gondi", + "masaram_gondi", + "masaram_gondi", + "masaram_gondi", + "masaram_gondi", + "masaram_gondi", + "gunjala_gondi", + "gunjala_gondi", + "gunjala_gondi", + "gunjala_gondi", + "gunjala_gondi", + "gunjala_gondi", + "tolong_siki", + "tolong_siki", + "makasar", + "kawi", + "kawi", + "kawi", + "lisu", + "tamil", + "tamil", + "cuneiform", + "cuneiform", + "cuneiform", + "cuneiform", + "cypro_minoan", + "egyptian_hieroglyphs", + "egyptian_hieroglyphs", + "anatolian_hieroglyphs", + "gurung_khema", + "bamum", + "mro", + "mro", + "mro", + "tangsa", + "tangsa", + "bassa_vah", + "bassa_vah", + "pahawh_hmong", + "pahawh_hmong", + "pahawh_hmong", + "pahawh_hmong", + "pahawh_hmong", + "kirat_rai", + "medefaidrin", + "beria_erfe", + "beria_erfe", + "miao", + "miao", + "miao", + "tangut", + "nushu", + "han", + "khitan_small_script", + "han", + "tangut", + "khitan_small_script", + "khitan_small_script", + "tangut", + "tangut", + "katakana", + "katakana", + "katakana", + "katakana", + "hiragana", + "katakana", + "hiragana", + "hiragana", + "katakana", + "katakana", + "nushu", + "duployan", + "duployan", + "duployan", + "duployan", + "duployan", + "common", + "common", + "common", + "common", + "common", + "inherited", + "inherited", + "common", + "common", + "common", + "common", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "inherited", + "common", + "greek", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "signwriting", + "signwriting", + "signwriting", + "latin", + "latin", + "glagolitic", + "glagolitic", + "glagolitic", + "glagolitic", + "glagolitic", + "cyrillic", + "cyrillic", + "nyiakeng_puachue_hmong", + "nyiakeng_puachue_hmong", + "nyiakeng_puachue_hmong", + "nyiakeng_puachue_hmong", + "toto", + "wancho", + "wancho", + "nag_mundari", + "ol_onal", + "ol_onal", + "tai_yo", + "tai_yo", + "tai_yo", + "ethiopic", + "ethiopic", + "ethiopic", + "ethiopic", + "mende_kikakui", + "mende_kikakui", + "adlam", + "adlam", + "adlam", + "common", + "common", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "arabic", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "hiragana", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "common", + "han", + "han", + "han", + "han", + "han", + "han", + "han", + "han", + "common", + "common", + "inherited", +}; +static const std::vector unicode_script_lasts = { + 0x000040, + 0x00005A, + 0x000060, + 0x00007A, + 0x0000A9, + 0x0000AA, + 0x0000B9, + 0x0000BA, + 0x0000BF, + 0x0000D6, + 0x0000D7, + 0x0000F6, + 0x0000F7, + 0x0002B8, + 0x0002DF, + 0x0002E4, + 0x0002E9, + 0x0002EB, + 0x0002FF, + 0x00036F, + 0x000373, + 0x000374, + 0x000377, + 0x00037D, + 0x00037E, + 0x00037F, + 0x000384, + 0x000385, + 0x000386, + 0x000387, + 0x00038A, + 0x00038C, + 0x0003A1, + 0x0003E1, + 0x0003EF, + 0x0003FF, + 0x000484, + 0x000486, + 0x00052F, + 0x000556, + 0x00058A, + 0x00058F, + 0x0005C7, + 0x0005EA, + 0x0005F4, + 0x000604, + 0x000605, + 0x00060B, + 0x00060C, + 0x00061A, + 0x00061B, + 0x00061E, + 0x00061F, + 0x00063F, + 0x000640, + 0x00064A, + 0x000655, + 0x00066F, + 0x000670, + 0x0006DC, + 0x0006DD, + 0x0006FF, + 0x00070D, + 0x00074A, + 0x00074F, + 0x00077F, + 0x0007B1, + 0x0007FA, + 0x0007FF, + 0x00082D, + 0x00083E, + 0x00085B, + 0x00085E, + 0x00086A, + 0x000891, + 0x0008E1, + 0x0008E2, + 0x0008FF, + 0x000950, + 0x000954, + 0x000963, + 0x000965, + 0x00097F, + 0x000983, + 0x00098C, + 0x000990, + 0x0009A8, + 0x0009B0, + 0x0009B2, + 0x0009B9, + 0x0009C4, + 0x0009C8, + 0x0009CE, + 0x0009D7, + 0x0009DD, + 0x0009E3, + 0x0009FE, + 0x000A03, + 0x000A0A, + 0x000A10, + 0x000A28, + 0x000A30, + 0x000A33, + 0x000A36, + 0x000A39, + 0x000A3C, + 0x000A42, + 0x000A48, + 0x000A4D, + 0x000A51, + 0x000A5C, + 0x000A5E, + 0x000A76, + 0x000A83, + 0x000A8D, + 0x000A91, + 0x000AA8, + 0x000AB0, + 0x000AB3, + 0x000AB9, + 0x000AC5, + 0x000AC9, + 0x000ACD, + 0x000AD0, + 0x000AE3, + 0x000AF1, + 0x000AFF, + 0x000B03, + 0x000B0C, + 0x000B10, + 0x000B28, + 0x000B30, + 0x000B33, + 0x000B39, + 0x000B44, + 0x000B48, + 0x000B4D, + 0x000B57, + 0x000B5D, + 0x000B63, + 0x000B77, + 0x000B83, + 0x000B8A, + 0x000B90, + 0x000B95, + 0x000B9A, + 0x000B9C, + 0x000B9F, + 0x000BA4, + 0x000BAA, + 0x000BB9, + 0x000BC2, + 0x000BC8, + 0x000BCD, + 0x000BD0, + 0x000BD7, + 0x000BFA, + 0x000C0C, + 0x000C10, + 0x000C28, + 0x000C39, + 0x000C44, + 0x000C48, + 0x000C4D, + 0x000C56, + 0x000C5A, + 0x000C5D, + 0x000C63, + 0x000C6F, + 0x000C7F, + 0x000C8C, + 0x000C90, + 0x000CA8, + 0x000CB3, + 0x000CB9, + 0x000CC4, + 0x000CC8, + 0x000CCD, + 0x000CD6, + 0x000CDE, + 0x000CE3, + 0x000CEF, + 0x000CF3, + 0x000D0C, + 0x000D10, + 0x000D44, + 0x000D48, + 0x000D4F, + 0x000D63, + 0x000D7F, + 0x000D83, + 0x000D96, + 0x000DB1, + 0x000DBB, + 0x000DBD, + 0x000DC6, + 0x000DCA, + 0x000DD4, + 0x000DD6, + 0x000DDF, + 0x000DEF, + 0x000DF4, + 0x000E3A, + 0x000E3F, + 0x000E5B, + 0x000E82, + 0x000E84, + 0x000E8A, + 0x000EA3, + 0x000EA5, + 0x000EBD, + 0x000EC4, + 0x000EC6, + 0x000ECE, + 0x000ED9, + 0x000EDF, + 0x000F47, + 0x000F6C, + 0x000F97, + 0x000FBC, + 0x000FCC, + 0x000FD4, + 0x000FD8, + 0x000FDA, + 0x00109F, + 0x0010C5, + 0x0010C7, + 0x0010CD, + 0x0010FA, + 0x0010FB, + 0x0010FF, + 0x0011FF, + 0x001248, + 0x00124D, + 0x001256, + 0x001258, + 0x00125D, + 0x001288, + 0x00128D, + 0x0012B0, + 0x0012B5, + 0x0012BE, + 0x0012C0, + 0x0012C5, + 0x0012D6, + 0x001310, + 0x001315, + 0x00135A, + 0x00137C, + 0x001399, + 0x0013F5, + 0x0013FD, + 0x00167F, + 0x00169C, + 0x0016EA, + 0x0016ED, + 0x0016F8, + 0x001715, + 0x00171F, + 0x001734, + 0x001736, + 0x001753, + 0x00176C, + 0x001770, + 0x001773, + 0x0017DD, + 0x0017E9, + 0x0017F9, + 0x001801, + 0x001803, + 0x001804, + 0x001805, + 0x001819, + 0x001878, + 0x0018AA, + 0x0018F5, + 0x00191E, + 0x00192B, + 0x00193B, + 0x001940, + 0x00194F, + 0x00196D, + 0x001974, + 0x0019AB, + 0x0019C9, + 0x0019DA, + 0x0019DF, + 0x0019FF, + 0x001A1B, + 0x001A1F, + 0x001A5E, + 0x001A7C, + 0x001A89, + 0x001A99, + 0x001AAD, + 0x001ADD, + 0x001AEB, + 0x001B4C, + 0x001B7F, + 0x001BBF, + 0x001BF3, + 0x001BFF, + 0x001C37, + 0x001C49, + 0x001C4F, + 0x001C7F, + 0x001C8A, + 0x001CBA, + 0x001CBF, + 0x001CC7, + 0x001CD2, + 0x001CD3, + 0x001CE0, + 0x001CE1, + 0x001CE8, + 0x001CEC, + 0x001CED, + 0x001CF3, + 0x001CF4, + 0x001CF7, + 0x001CF9, + 0x001CFA, + 0x001D25, + 0x001D2A, + 0x001D2B, + 0x001D5C, + 0x001D61, + 0x001D65, + 0x001D6A, + 0x001D77, + 0x001D78, + 0x001DBE, + 0x001DBF, + 0x001DFF, + 0x001EFF, + 0x001F15, + 0x001F1D, + 0x001F45, + 0x001F4D, + 0x001F57, + 0x001F59, + 0x001F5B, + 0x001F5D, + 0x001F7D, + 0x001FB4, + 0x001FC4, + 0x001FD3, + 0x001FDB, + 0x001FEF, + 0x001FF4, + 0x001FFE, + 0x00200B, + 0x00200D, + 0x002064, + 0x002070, + 0x002071, + 0x00207E, + 0x00207F, + 0x00208E, + 0x00209C, + 0x0020C1, + 0x0020F0, + 0x002125, + 0x002126, + 0x002129, + 0x00212B, + 0x002131, + 0x002132, + 0x00214D, + 0x00214E, + 0x00215F, + 0x002188, + 0x00218B, + 0x002429, + 0x00244A, + 0x0027FF, + 0x0028FF, + 0x002B73, + 0x002BFF, + 0x002C5F, + 0x002C7F, + 0x002CF3, + 0x002CFF, + 0x002D25, + 0x002D27, + 0x002D2D, + 0x002D67, + 0x002D70, + 0x002D7F, + 0x002D96, + 0x002DA6, + 0x002DAE, + 0x002DB6, + 0x002DBE, + 0x002DC6, + 0x002DCE, + 0x002DD6, + 0x002DDE, + 0x002DFF, + 0x002E5D, + 0x002E99, + 0x002EF3, + 0x002FD5, + 0x003004, + 0x003005, + 0x003006, + 0x003007, + 0x003020, + 0x003029, + 0x00302D, + 0x00302F, + 0x003037, + 0x00303B, + 0x00303F, + 0x003096, + 0x00309A, + 0x00309C, + 0x00309F, + 0x0030A0, + 0x0030FA, + 0x0030FC, + 0x0030FF, + 0x00312F, + 0x00318E, + 0x00319F, + 0x0031BF, + 0x0031E5, + 0x0031EF, + 0x0031FF, + 0x00321E, + 0x00325F, + 0x00327E, + 0x0032CF, + 0x0032FE, + 0x0032FF, + 0x003357, + 0x0033FF, + 0x004DBF, + 0x004DFF, + 0x009FFF, + 0x00A48C, + 0x00A4C6, + 0x00A4FF, + 0x00A62B, + 0x00A69F, + 0x00A6F7, + 0x00A721, + 0x00A787, + 0x00A78A, + 0x00A7DC, + 0x00A7FF, + 0x00A82C, + 0x00A839, + 0x00A877, + 0x00A8C5, + 0x00A8D9, + 0x00A8FF, + 0x00A92D, + 0x00A92E, + 0x00A92F, + 0x00A953, + 0x00A95F, + 0x00A97C, + 0x00A9CD, + 0x00A9CF, + 0x00A9D9, + 0x00A9DF, + 0x00A9FE, + 0x00AA36, + 0x00AA4D, + 0x00AA59, + 0x00AA5F, + 0x00AA7F, + 0x00AAC2, + 0x00AADF, + 0x00AAF6, + 0x00AB06, + 0x00AB0E, + 0x00AB16, + 0x00AB26, + 0x00AB2E, + 0x00AB5A, + 0x00AB5B, + 0x00AB64, + 0x00AB65, + 0x00AB69, + 0x00AB6B, + 0x00ABBF, + 0x00ABED, + 0x00ABF9, + 0x00D7A3, + 0x00D7C6, + 0x00D7FB, + 0x00FA6D, + 0x00FAD9, + 0x00FB06, + 0x00FB17, + 0x00FB36, + 0x00FB3C, + 0x00FB3E, + 0x00FB41, + 0x00FB44, + 0x00FB4F, + 0x00FD3D, + 0x00FD3F, + 0x00FDCF, + 0x00FDFF, + 0x00FE0F, + 0x00FE19, + 0x00FE2D, + 0x00FE2F, + 0x00FE52, + 0x00FE66, + 0x00FE6B, + 0x00FE74, + 0x00FEFC, + 0x00FEFF, + 0x00FF20, + 0x00FF3A, + 0x00FF40, + 0x00FF5A, + 0x00FF65, + 0x00FF6F, + 0x00FF70, + 0x00FF9D, + 0x00FF9F, + 0x00FFBE, + 0x00FFC7, + 0x00FFCF, + 0x00FFD7, + 0x00FFDC, + 0x00FFE6, + 0x00FFEE, + 0x00FFFD, + 0x01000B, + 0x010026, + 0x01003A, + 0x01003D, + 0x01004D, + 0x01005D, + 0x0100FA, + 0x010102, + 0x010133, + 0x01013F, + 0x01018E, + 0x01019C, + 0x0101A0, + 0x0101FC, + 0x0101FD, + 0x01029C, + 0x0102D0, + 0x0102E0, + 0x0102FB, + 0x010323, + 0x01032F, + 0x01034A, + 0x01037A, + 0x01039D, + 0x01039F, + 0x0103C3, + 0x0103D5, + 0x01044F, + 0x01047F, + 0x01049D, + 0x0104A9, + 0x0104D3, + 0x0104FB, + 0x010527, + 0x010563, + 0x01056F, + 0x01057A, + 0x01058A, + 0x010592, + 0x010595, + 0x0105A1, + 0x0105B1, + 0x0105B9, + 0x0105BC, + 0x0105F3, + 0x010736, + 0x010755, + 0x010767, + 0x010785, + 0x0107B0, + 0x0107BA, + 0x010805, + 0x010808, + 0x010835, + 0x010838, + 0x01083C, + 0x01083F, + 0x010855, + 0x01085F, + 0x01087F, + 0x01089E, + 0x0108AF, + 0x0108F2, + 0x0108F5, + 0x0108FF, + 0x01091B, + 0x01091F, + 0x010939, + 0x01093F, + 0x010959, + 0x01099F, + 0x0109B7, + 0x0109CF, + 0x0109FF, + 0x010A03, + 0x010A06, + 0x010A13, + 0x010A17, + 0x010A35, + 0x010A3A, + 0x010A48, + 0x010A58, + 0x010A7F, + 0x010A9F, + 0x010AE6, + 0x010AF6, + 0x010B35, + 0x010B3F, + 0x010B55, + 0x010B5F, + 0x010B72, + 0x010B7F, + 0x010B91, + 0x010B9C, + 0x010BAF, + 0x010C48, + 0x010CB2, + 0x010CF2, + 0x010CFF, + 0x010D27, + 0x010D39, + 0x010D65, + 0x010D85, + 0x010D8F, + 0x010E7E, + 0x010EA9, + 0x010EAD, + 0x010EB1, + 0x010EC7, + 0x010ED8, + 0x010EFF, + 0x010F27, + 0x010F59, + 0x010F89, + 0x010FCB, + 0x010FF6, + 0x01104D, + 0x011075, + 0x01107F, + 0x0110C2, + 0x0110CD, + 0x0110E8, + 0x0110F9, + 0x011134, + 0x011147, + 0x011176, + 0x0111DF, + 0x0111F4, + 0x011211, + 0x011241, + 0x011286, + 0x011288, + 0x01128D, + 0x01129D, + 0x0112A9, + 0x0112EA, + 0x0112F9, + 0x011303, + 0x01130C, + 0x011310, + 0x011328, + 0x011330, + 0x011333, + 0x011339, + 0x01133B, + 0x011344, + 0x011348, + 0x01134D, + 0x011350, + 0x011357, + 0x011363, + 0x01136C, + 0x011374, + 0x011389, + 0x01138B, + 0x01138E, + 0x0113B5, + 0x0113C0, + 0x0113C2, + 0x0113C5, + 0x0113CA, + 0x0113D5, + 0x0113D8, + 0x0113E2, + 0x01145B, + 0x011461, + 0x0114C7, + 0x0114D9, + 0x0115B5, + 0x0115DD, + 0x011644, + 0x011659, + 0x01166C, + 0x0116B9, + 0x0116C9, + 0x0116E3, + 0x01171A, + 0x01172B, + 0x011746, + 0x01183B, + 0x0118F2, + 0x0118FF, + 0x011906, + 0x011909, + 0x011913, + 0x011916, + 0x011935, + 0x011938, + 0x011946, + 0x011959, + 0x0119A7, + 0x0119D7, + 0x0119E4, + 0x011A47, + 0x011AA2, + 0x011ABF, + 0x011AF8, + 0x011B09, + 0x011B67, + 0x011BE1, + 0x011BF9, + 0x011C08, + 0x011C36, + 0x011C45, + 0x011C6C, + 0x011C8F, + 0x011CA7, + 0x011CB6, + 0x011D06, + 0x011D09, + 0x011D36, + 0x011D3A, + 0x011D3D, + 0x011D47, + 0x011D59, + 0x011D65, + 0x011D68, + 0x011D8E, + 0x011D91, + 0x011D98, + 0x011DA9, + 0x011DDB, + 0x011DE9, + 0x011EF8, + 0x011F10, + 0x011F3A, + 0x011F5A, + 0x011FB0, + 0x011FF1, + 0x011FFF, + 0x012399, + 0x01246E, + 0x012474, + 0x012543, + 0x012FF2, + 0x013455, + 0x0143FA, + 0x014646, + 0x016139, + 0x016A38, + 0x016A5E, + 0x016A69, + 0x016A6F, + 0x016ABE, + 0x016AC9, + 0x016AED, + 0x016AF5, + 0x016B45, + 0x016B59, + 0x016B61, + 0x016B77, + 0x016B8F, + 0x016D79, + 0x016E9A, + 0x016EB8, + 0x016ED3, + 0x016F4A, + 0x016F87, + 0x016F9F, + 0x016FE0, + 0x016FE1, + 0x016FE3, + 0x016FE4, + 0x016FF6, + 0x018AFF, + 0x018CD5, + 0x018CFF, + 0x018D1E, + 0x018DF2, + 0x01AFF3, + 0x01AFFB, + 0x01AFFE, + 0x01B000, + 0x01B11F, + 0x01B122, + 0x01B132, + 0x01B152, + 0x01B155, + 0x01B167, + 0x01B2FB, + 0x01BC6A, + 0x01BC7C, + 0x01BC88, + 0x01BC99, + 0x01BC9F, + 0x01BCA3, + 0x01CCFC, + 0x01CEB3, + 0x01CED0, + 0x01CEF0, + 0x01CF2D, + 0x01CF46, + 0x01CFC3, + 0x01D0F5, + 0x01D126, + 0x01D166, + 0x01D169, + 0x01D17A, + 0x01D182, + 0x01D184, + 0x01D18B, + 0x01D1A9, + 0x01D1AD, + 0x01D1EA, + 0x01D245, + 0x01D2D3, + 0x01D2F3, + 0x01D356, + 0x01D378, + 0x01D454, + 0x01D49C, + 0x01D49F, + 0x01D4A2, + 0x01D4A6, + 0x01D4AC, + 0x01D4B9, + 0x01D4BB, + 0x01D4C3, + 0x01D505, + 0x01D50A, + 0x01D514, + 0x01D51C, + 0x01D539, + 0x01D53E, + 0x01D544, + 0x01D546, + 0x01D550, + 0x01D6A5, + 0x01D7CB, + 0x01D7FF, + 0x01DA8B, + 0x01DA9F, + 0x01DAAF, + 0x01DF1E, + 0x01DF2A, + 0x01E006, + 0x01E018, + 0x01E021, + 0x01E024, + 0x01E02A, + 0x01E06D, + 0x01E08F, + 0x01E12C, + 0x01E13D, + 0x01E149, + 0x01E14F, + 0x01E2AE, + 0x01E2F9, + 0x01E2FF, + 0x01E4F9, + 0x01E5FA, + 0x01E5FF, + 0x01E6DE, + 0x01E6F5, + 0x01E6FF, + 0x01E7E6, + 0x01E7EB, + 0x01E7EE, + 0x01E7FE, + 0x01E8C4, + 0x01E8D6, + 0x01E94B, + 0x01E959, + 0x01E95F, + 0x01ECB4, + 0x01ED3D, + 0x01EE03, + 0x01EE1F, + 0x01EE22, + 0x01EE24, + 0x01EE27, + 0x01EE32, + 0x01EE37, + 0x01EE39, + 0x01EE3B, + 0x01EE42, + 0x01EE47, + 0x01EE49, + 0x01EE4B, + 0x01EE4F, + 0x01EE52, + 0x01EE54, + 0x01EE57, + 0x01EE59, + 0x01EE5B, + 0x01EE5D, + 0x01EE5F, + 0x01EE62, + 0x01EE64, + 0x01EE6A, + 0x01EE72, + 0x01EE77, + 0x01EE7C, + 0x01EE7E, + 0x01EE89, + 0x01EE9B, + 0x01EEA3, + 0x01EEA9, + 0x01EEBB, + 0x01EEF1, + 0x01F02B, + 0x01F093, + 0x01F0AE, + 0x01F0BF, + 0x01F0CF, + 0x01F0F5, + 0x01F1AD, + 0x01F1FF, + 0x01F200, + 0x01F202, + 0x01F23B, + 0x01F248, + 0x01F251, + 0x01F265, + 0x01F6D8, + 0x01F6EC, + 0x01F6FC, + 0x01F7D9, + 0x01F7EB, + 0x01F7F0, + 0x01F80B, + 0x01F847, + 0x01F859, + 0x01F887, + 0x01F8AD, + 0x01F8BB, + 0x01F8C1, + 0x01F8D8, + 0x01FA57, + 0x01FA6D, + 0x01FA7C, + 0x01FA8A, + 0x01FAC6, + 0x01FAC8, + 0x01FADC, + 0x01FAEA, + 0x01FAF8, + 0x01FB92, + 0x01FBFA, + 0x02A6DF, + 0x02B81D, + 0x02CEAD, + 0x02EBE0, + 0x02EE5D, + 0x02FA1D, + 0x03134A, + 0x033479, + 0x0E0001, + 0x0E007F, + 0x0E01EF, +}; + const auto cpts = unicode_cpts_from_utf8(*utf8); + const size_t n_cpt = cpts.size(); + + std::vector scripts; + scripts.reserve(n_cpt); + + for (const auto& cpt: cpts) { + const auto it = std::lower_bound(unicode_script_lasts.begin(), unicode_script_lasts.end(), cpt); + if (it != unicode_script_lasts.end()) { + scripts.push_back(unicode_scripts[std::distance(unicode_script_lasts.begin(), it)]); + } + } + + if (dst_cpts != nullptr) { + *dst_cpts = cpts; + } + if (dst_scripts != nullptr) { + *dst_scripts = scripts; + } + + return n_cpt; +} + diff --git a/src/unicode.h b/src/unicode.h index 1d702d0a..2f59fc2b 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -88,6 +88,8 @@ struct unicode_cpt_flags { } }; +size_t unicode_fill_from_utf8(std::string* utf8, std::vector* dst_cpts, std::vector* dst_scripts); + size_t unicode_len_utf8(char src); std::string unicode_cpt_to_utf8 (uint32_t cpt);