-
Notifications
You must be signed in to change notification settings - Fork 8.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
examples : new program to verify gguf tokenizer parameters
This program verifies that a given gguf model file can tokenize every potentially valid character. Since llama.cpp currently raises an exception when tokenization is not possible[1], this tool helps verify that valid ASCII and UTF-8 input will always be properly tokenized. [1] #2580
- Loading branch information
Showing
3 changed files
with
84 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
set(TARGET tokenizer-verifier) | ||
add_executable(${TARGET} tokenizer-verifier.cpp) | ||
install(TARGETS ${TARGET} RUNTIME) | ||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||
target_compile_features(${TARGET} PRIVATE cxx_std_11) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#include "common.h" | ||
#include "llama.h" | ||
|
||
#include <cstdio> | ||
#include <string> | ||
#include <vector> | ||
|
||
// Encode a single Unicode code point as UTF-8.
//
// Writes between 1 and 4 bytes to `dest` and does NOT append a terminating
// NUL: the caller must supply at least 4 writable bytes, and a zero-filled
// 5-byte buffer if the result is to be used as a C string.
//
// The reserved UTF-16 surrogate range 0xD800-0xDFFF is deliberately encoded
// like any other three-byte value for simplicity, even though such sequences
// are not well-formed UTF-8.
//
// Returns 0 on success, or 1 if `codepoint` lies outside [0, 0x10FFFF]; in
// that case `dest` is left untouched.
static int unicode_to_utf8(int codepoint, char *dest) {
    // Reject out-of-range input first. A negative value would otherwise
    // satisfy the "< 0x80" test below and be emitted as a bogus single byte.
    if (codepoint < 0 || codepoint > 0x10FFFF) {
        return 1;
    }

    if (codepoint < 0x80) {
        // 0xxxxxxx
        dest[0] = static_cast<char>(codepoint);
    } else if (codepoint < 0x800) {
        // 110xxxxx 10xxxxxx
        dest[0] = static_cast<char>(0xC0 | (codepoint >> 6));
        dest[1] = static_cast<char>(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        dest[0] = static_cast<char>(0xE0 | (codepoint >> 12));
        dest[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        dest[2] = static_cast<char>(0x80 | (codepoint & 0x3F));
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        dest[0] = static_cast<char>(0xF0 | (codepoint >> 18));
        dest[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        dest[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        dest[3] = static_cast<char>(0x80 | (codepoint & 0x3F));
    }
    return 0;
}
|
||
int main(int argc, char **argv) { | ||
if (argc < 2) { | ||
printf("usage: %s MODEL_PATH\n", argv[0]); | ||
return 1; | ||
} | ||
|
||
const char *model_path = argv[1]; | ||
|
||
llama_backend_init(); | ||
|
||
llama_model_params model_params = llama_model_default_params(); | ||
model_params.vocab_only = true; | ||
llama_model *model = llama_load_model_from_file(model_path, model_params); | ||
|
||
std::vector<llama_token> tokens; | ||
|
||
int failed_ascii = 0; | ||
int ascii_max = 127; | ||
for (int c = 0; c <= ascii_max; c++) { | ||
const char prompt[] = {(char)c, '\0'}; | ||
try { | ||
tokens = ::llama_tokenize(model, prompt, false, true); | ||
} catch (...) { | ||
printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c); | ||
failed_ascii += 1; | ||
continue; | ||
} | ||
} | ||
printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max); | ||
|
||
int failed_unicode = 0; | ||
int utf8_max = 0x10FFFF; | ||
// Now let's do all potential codepoints | ||
for (int cp = 0; cp <= utf8_max; cp++) { | ||
char buf[5] = {}; | ||
if (unicode_to_utf8(cp, buf)) { | ||
printf("Impossible to encode codepoint %#x\n", cp); | ||
continue; | ||
} | ||
try { | ||
tokens = ::llama_tokenize(model, buf, false, true); | ||
} catch (...) { | ||
// printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf); | ||
failed_unicode += 1; | ||
continue; | ||
} | ||
} | ||
printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode, | ||
utf8_max); | ||
|
||
return (failed_ascii != 0 || failed_unicode != 0); | ||
} |